[llvm] 0819a64 - [SelectionDAG] Better legalization for FSHL and FSHR
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 21 02:33:42 PDT 2020
Author: Jay Foad
Date: 2020-08-21T10:32:49+01:00
New Revision: 0819a6416fd217413a1d04e93531db1b30272e9b
URL: https://github.com/llvm/llvm-project/commit/0819a6416fd217413a1d04e93531db1b30272e9b
DIFF: https://github.com/llvm/llvm-project/commit/0819a6416fd217413a1d04e93531db1b30272e9b.diff
LOG: [SelectionDAG] Better legalization for FSHL and FSHR
In SelectionDAGBuilder, always translate the fshl and fshr intrinsics to
FSHL and FSHR (or ROTL and ROTR) instead of lowering them to shifts and
ORs. Improve the legalization of FSHL and FSHR to avoid code quality
regressions.
Differential Revision: https://reviews.llvm.org/D77152
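
For context, here is a minimal C++ reference model of the llvm.fshl/llvm.fshr
semantics involved (the function names are illustrative only, not LLVM APIs):
fshl returns the upper half of the double-width concatenation X:Y shifted left
by Z modulo the bit width, fshr returns the lower half shifted right, and when
both data operands are the same value the operations degenerate to rotates,
which is why the builder can emit ROTL/ROTR for that case.

#include <cassert>
#include <cstdint>

// Reference semantics for a 32-bit element type. The shift amount is taken
// modulo the bit width, so no amount value is undefined behaviour.
static uint32_t ref_fshl(uint32_t x, uint32_t y, uint32_t z) {
  unsigned s = z % 32;
  // Upper 32 bits of the 64-bit concatenation (x:y) shifted left by s.
  return s ? (x << s) | (y >> (32 - s)) : x;
}

static uint32_t ref_fshr(uint32_t x, uint32_t y, uint32_t z) {
  unsigned s = z % 32;
  // Lower 32 bits of the 64-bit concatenation (x:y) shifted right by s.
  return s ? (x << (32 - s)) | (y >> s) : y;
}

int main() {
  assert(ref_fshl(0xAABBCCDDu, 0x11223344u, 8) == 0xBBCCDD11u);
  assert(ref_fshr(0xAABBCCDDu, 0x11223344u, 8) == 0xDD112233u);
  return 0;
}
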
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
llvm/lib/Target/RISCV/RISCVInstrInfoB.td
llvm/test/CodeGen/AArch64/funnel-shift-rot.ll
llvm/test/CodeGen/AArch64/funnel-shift.ll
llvm/test/CodeGen/AArch64/shift-by-signext.ll
llvm/test/CodeGen/AMDGPU/fshl.ll
llvm/test/CodeGen/AMDGPU/fshr.ll
llvm/test/CodeGen/PowerPC/funnel-shift-rot.ll
llvm/test/CodeGen/PowerPC/funnel-shift.ll
llvm/test/CodeGen/RISCV/rv32Zbbp.ll
llvm/test/CodeGen/RISCV/rv32Zbt.ll
llvm/test/CodeGen/RISCV/rv64Zbbp.ll
llvm/test/CodeGen/RISCV/rv64Zbt.ll
llvm/test/CodeGen/X86/fshl.ll
llvm/test/CodeGen/X86/fshr.ll
llvm/test/CodeGen/X86/funnel-shift.ll
llvm/test/CodeGen/X86/vector-fshl-128.ll
llvm/test/CodeGen/X86/vector-fshl-256.ll
llvm/test/CodeGen/X86/vector-fshl-512.ll
llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
llvm/test/CodeGen/X86/vector-fshr-128.ll
llvm/test/CodeGen/X86/vector-fshr-256.ll
llvm/test/CodeGen/X86/vector-fshr-512.ll
llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 6eaf28096be2..77a79a0479ef 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -207,6 +207,16 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::FREEZE:
Res = PromoteIntRes_FREEZE(N);
break;
+
+ case ISD::ROTL:
+ case ISD::ROTR:
+ Res = PromoteIntRes_Rotate(N);
+ break;
+
+ case ISD::FSHL:
+ case ISD::FSHR:
+ Res = PromoteIntRes_FunnelShift(N);
+ break;
}
// If the result is null then the sub-method took care of registering it.
@@ -1105,6 +1115,43 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SRL(SDNode *N) {
return DAG.getNode(ISD::SRL, SDLoc(N), LHS.getValueType(), LHS, RHS);
}
+SDValue DAGTypeLegalizer::PromoteIntRes_Rotate(SDNode *N) {
+ // Lower the rotate to shifts and ORs which can be promoted.
+ SDValue Res;
+ TLI.expandROT(N, Res, DAG);
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return SDValue();
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_FunnelShift(SDNode *N) {
+ SDValue Hi = GetPromotedInteger(N->getOperand(0));
+ SDValue Lo = GetPromotedInteger(N->getOperand(1));
+ SDValue Amount = GetPromotedInteger(N->getOperand(2));
+
+ unsigned OldBits = N->getOperand(0).getScalarValueSizeInBits();
+ unsigned NewBits = Hi.getScalarValueSizeInBits();
+
+ // Shift Lo up to occupy the upper bits of the promoted type.
+ SDLoc DL(N);
+ EVT VT = Lo.getValueType();
+ Lo = DAG.getNode(ISD::SHL, DL, VT, Lo,
+ DAG.getConstant(NewBits - OldBits, DL, VT));
+
+ // Amount has to be interpreted modulo the old bit width.
+ Amount =
+ DAG.getNode(ISD::UREM, DL, VT, Amount, DAG.getConstant(OldBits, DL, VT));
+
+ unsigned Opcode = N->getOpcode();
+ if (Opcode == ISD::FSHR) {
+ // Increase Amount to shift the result into the lower bits of the promoted
+ // type.
+ Amount = DAG.getNode(ISD::ADD, DL, VT, Amount,
+ DAG.getConstant(NewBits - OldBits, DL, VT));
+ }
+
+ return DAG.getNode(Opcode, DL, VT, Hi, Lo, Amount);
+}
+
SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Res;
@@ -2059,6 +2106,16 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::VECREDUCE_SMIN:
case ISD::VECREDUCE_UMAX:
case ISD::VECREDUCE_UMIN: ExpandIntRes_VECREDUCE(N, Lo, Hi); break;
+
+ case ISD::ROTL:
+ case ISD::ROTR:
+ ExpandIntRes_Rotate(N, Lo, Hi);
+ break;
+
+ case ISD::FSHL:
+ case ISD::FSHR:
+ ExpandIntRes_FunnelShift(N, Lo, Hi);
+ break;
}
// If Lo/Hi is null, the sub-method took care of registering results etc.
@@ -3895,6 +3952,22 @@ void DAGTypeLegalizer::ExpandIntRes_VECREDUCE(SDNode *N,
SplitInteger(Res, Lo, Hi);
}
+void DAGTypeLegalizer::ExpandIntRes_Rotate(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ // Lower the rotate to shifts and ORs which can be expanded.
+ SDValue Res;
+ TLI.expandROT(N, Res, DAG);
+ SplitInteger(Res, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_FunnelShift(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ // Lower the funnel shift to shifts and ORs which can be expanded.
+ SDValue Res;
+ TLI.expandFunnelShift(N, Res, DAG);
+ SplitInteger(Res, Lo, Hi);
+}
+
//===----------------------------------------------------------------------===//
// Integer Operand Expansion
//===----------------------------------------------------------------------===//
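
As a rough standalone illustration of the promotion strategy in the
PromoteIntRes_FunnelShift hunk above (this is only a scalar C++ sketch of the
identity, not the DAG code; all names below are made up for the example): a
narrow funnel shift can be evaluated with a wider one by shifting the second
operand into the upper bits of the promoted type, reducing the amount modulo
the old bit width, and, for fshr, biasing the amount by the width difference so
the result lands in the low bits.

#include <cassert>
#include <cstdint>

// 32-bit funnel shifts, standing in for the "wide" operations that remain
// after promotion.
static uint32_t fshl32(uint32_t x, uint32_t y, uint32_t z) {
  unsigned s = z % 32;
  return s ? (x << s) | (y >> (32 - s)) : x;
}
static uint32_t fshr32(uint32_t x, uint32_t y, uint32_t z) {
  unsigned s = z % 32;
  return s ? (x << (32 - s)) | (y >> s) : y;
}

// 8-bit funnel shifts computed via the 32-bit ones, mirroring the promotion:
// shift Lo into the upper bits, take the amount modulo the old width, and for
// fshr add (NewBits - OldBits) to the amount.
static uint8_t fshl8_via32(uint8_t hi, uint8_t lo, uint8_t amt) {
  const unsigned Old = 8, New = 32;
  return (uint8_t)fshl32(hi, (uint32_t)lo << (New - Old), amt % Old);
}
static uint8_t fshr8_via32(uint8_t hi, uint8_t lo, uint8_t amt) {
  const unsigned Old = 8, New = 32;
  return (uint8_t)fshr32(hi, (uint32_t)lo << (New - Old),
                         amt % Old + (New - Old));
}

int main() {
  // Exhaustively check the identity against a direct 8-bit reference.
  for (unsigned x = 0; x < 256; ++x)
    for (unsigned y = 0; y < 256; ++y)
      for (unsigned z = 0; z < 8; ++z) {
        uint8_t l = z ? (uint8_t)((x << z) | (y >> (8 - z))) : (uint8_t)x;
        uint8_t r = z ? (uint8_t)((x << (8 - z)) | (y >> z)) : (uint8_t)y;
        assert(fshl8_via32(x, y, z) == l);
        assert(fshr8_via32(x, y, z) == r);
      }
  return 0;
}
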
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 1daa907bbf01..364d0bb12365 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -347,6 +347,8 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue PromoteIntRes_FLT_ROUNDS(SDNode *N);
SDValue PromoteIntRes_VECREDUCE(SDNode *N);
SDValue PromoteIntRes_ABS(SDNode *N);
+ SDValue PromoteIntRes_Rotate(SDNode *N);
+ SDValue PromoteIntRes_FunnelShift(SDNode *N);
// Integer Operand Promotion.
bool PromoteIntegerOperand(SDNode *N, unsigned OpNo);
@@ -449,6 +451,9 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
void ExpandIntRes_ATOMIC_LOAD (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_VECREDUCE (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_Rotate (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_FunnelShift (SDNode *N, SDValue &Lo, SDValue &Hi);
+
void ExpandShiftByConstant(SDNode *N, const APInt &Amt,
SDValue &Lo, SDValue &Hi);
bool ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 353c2d7893f7..965d4a0955fb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -149,6 +149,8 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
R = ScalarizeVecRes_BinOp(N);
break;
case ISD::FMA:
+ case ISD::FSHL:
+ case ISD::FSHR:
R = ScalarizeVecRes_TernaryOp(N);
break;
@@ -946,9 +948,13 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::USUBSAT:
case ISD::SSHLSAT:
case ISD::USHLSAT:
+ case ISD::ROTL:
+ case ISD::ROTR:
SplitVecRes_BinOp(N, Lo, Hi);
break;
case ISD::FMA:
+ case ISD::FSHL:
+ case ISD::FSHR:
SplitVecRes_TernaryOp(N, Lo, Hi);
break;
@@ -2926,6 +2932,8 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
Res = WidenVecRes_Unary(N);
break;
case ISD::FMA:
+ case ISD::FSHL:
+ case ISD::FSHR:
Res = WidenVecRes_Ternary(N);
break;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 80a2db2a6b24..9e57fa084ad8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6252,62 +6252,14 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
SDValue Y = getValue(I.getArgOperand(1));
SDValue Z = getValue(I.getArgOperand(2));
EVT VT = X.getValueType();
- SDValue BitWidthC = DAG.getConstant(VT.getScalarSizeInBits(), sdl, VT);
- SDValue Zero = DAG.getConstant(0, sdl, VT);
- SDValue ShAmt = DAG.getNode(ISD::UREM, sdl, VT, Z, BitWidthC);
-
- // When X == Y, this is rotate. If the data type has a power-of-2 size, we
- // avoid the select that is necessary in the general case to filter out
- // the 0-shift possibility that leads to UB.
- if (X == Y && isPowerOf2_32(VT.getScalarSizeInBits())) {
- auto RotateOpcode = IsFSHL ? ISD::ROTL : ISD::ROTR;
- if (TLI.isOperationLegalOrCustom(RotateOpcode, VT)) {
- setValue(&I, DAG.getNode(RotateOpcode, sdl, VT, X, Z));
- return;
- }
-
- // Some targets only rotate one way. Try the opposite direction.
- RotateOpcode = IsFSHL ? ISD::ROTR : ISD::ROTL;
- if (TLI.isOperationLegalOrCustom(RotateOpcode, VT)) {
- // Negate the shift amount because it is safe to ignore the high bits.
- SDValue NegShAmt = DAG.getNode(ISD::SUB, sdl, VT, Zero, Z);
- setValue(&I, DAG.getNode(RotateOpcode, sdl, VT, X, NegShAmt));
- return;
- }
-
- // fshl (rotl): (X << (Z % BW)) | (X >> ((0 - Z) % BW))
- // fshr (rotr): (X << ((0 - Z) % BW)) | (X >> (Z % BW))
- SDValue NegZ = DAG.getNode(ISD::SUB, sdl, VT, Zero, Z);
- SDValue NShAmt = DAG.getNode(ISD::UREM, sdl, VT, NegZ, BitWidthC);
- SDValue ShX = DAG.getNode(ISD::SHL, sdl, VT, X, IsFSHL ? ShAmt : NShAmt);
- SDValue ShY = DAG.getNode(ISD::SRL, sdl, VT, X, IsFSHL ? NShAmt : ShAmt);
- setValue(&I, DAG.getNode(ISD::OR, sdl, VT, ShX, ShY));
- return;
- }
- auto FunnelOpcode = IsFSHL ? ISD::FSHL : ISD::FSHR;
- if (TLI.isOperationLegalOrCustom(FunnelOpcode, VT)) {
+ if (X == Y) {
+ auto RotateOpcode = IsFSHL ? ISD::ROTL : ISD::ROTR;
+ setValue(&I, DAG.getNode(RotateOpcode, sdl, VT, X, Z));
+ } else {
+ auto FunnelOpcode = IsFSHL ? ISD::FSHL : ISD::FSHR;
setValue(&I, DAG.getNode(FunnelOpcode, sdl, VT, X, Y, Z));
- return;
}
-
- // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
- // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
- SDValue InvShAmt = DAG.getNode(ISD::SUB, sdl, VT, BitWidthC, ShAmt);
- SDValue ShX = DAG.getNode(ISD::SHL, sdl, VT, X, IsFSHL ? ShAmt : InvShAmt);
- SDValue ShY = DAG.getNode(ISD::SRL, sdl, VT, Y, IsFSHL ? InvShAmt : ShAmt);
- SDValue Or = DAG.getNode(ISD::OR, sdl, VT, ShX, ShY);
-
- // If (Z % BW == 0), then the opposite direction shift is shift-by-bitwidth,
- // and that is undefined. We must compare and select to avoid UB.
- EVT CCVT = MVT::i1;
- if (VT.isVector())
- CCVT = EVT::getVectorVT(*Context, CCVT, VT.getVectorNumElements());
-
- // For fshl, 0-shift returns the 1st arg (X).
- // For fshr, 0-shift returns the 2nd arg (Y).
- SDValue IsZeroShift = DAG.getSetCC(sdl, CCVT, ShAmt, Zero, ISD::SETEQ);
- setValue(&I, DAG.getSelect(sdl, VT, IsZeroShift, IsFSHL ? X : Y, Or));
return;
}
case Intrinsic::sadd_sat: {
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 96a4354f6ba0..8231a398d192 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6156,6 +6156,18 @@ bool TargetLowering::expandFunnelShift(SDNode *Node, SDValue &Result,
EVT ShVT = Z.getValueType();
+ assert(isPowerOf2_32(BW) && "Expecting the type bitwidth to be a power of 2");
+
+ // If a funnel shift in the other direction is more supported, use it.
+ unsigned RevOpcode = IsFSHL ? ISD::FSHR : ISD::FSHL;
+ if (!isOperationLegalOrCustom(Node->getOpcode(), VT) &&
+ isOperationLegalOrCustom(RevOpcode, VT)) {
+ SDValue Zero = DAG.getConstant(0, DL, ShVT);
+ SDValue Sub = DAG.getNode(ISD::SUB, DL, ShVT, Zero, Z);
+ Result = DAG.getNode(RevOpcode, DL, VT, X, Y, Sub);
+ return true;
+ }
+
SDValue ShX, ShY;
SDValue ShAmt, InvShAmt;
if (isNonZeroModBitWidth(Z, BW)) {
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 7570385e38e3..6ce084a11431 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -124,25 +124,37 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
case ISD::SRL: {
if (!Subtarget->is64Bit())
break;
- SDValue Op0 = Node->getOperand(0);
- SDValue Op1 = Node->getOperand(1);
+ SDNode *Op0 = Node->getOperand(0).getNode();
uint64_t Mask;
// Match (srl (and val, mask), imm) where the result would be a
// zero-extended 32-bit integer. i.e. the mask is 0xffffffff or the result
// is equivalent to this (SimplifyDemandedBits may have removed lower bits
// from the mask that aren't necessary due to the right-shifting).
- if (Op1.getOpcode() == ISD::Constant &&
- isConstantMask(Op0.getNode(), Mask)) {
- uint64_t ShAmt = cast<ConstantSDNode>(Op1.getNode())->getZExtValue();
+ if (isa<ConstantSDNode>(Node->getOperand(1)) && isConstantMask(Op0, Mask)) {
+ uint64_t ShAmt = Node->getConstantOperandVal(1);
if ((Mask | maskTrailingOnes<uint64_t>(ShAmt)) == 0xffffffff) {
SDValue ShAmtVal =
CurDAG->getTargetConstant(ShAmt, SDLoc(Node), XLenVT);
- CurDAG->SelectNodeTo(Node, RISCV::SRLIW, XLenVT, Op0.getOperand(0),
+ CurDAG->SelectNodeTo(Node, RISCV::SRLIW, XLenVT, Op0->getOperand(0),
ShAmtVal);
return;
}
}
+ // Match (srl (shl val, 32), imm).
+ if (Op0->getOpcode() == ISD::SHL &&
+ isa<ConstantSDNode>(Op0->getOperand(1)) &&
+ isa<ConstantSDNode>(Node->getOperand(1))) {
+ uint64_t ShlAmt = Op0->getConstantOperandVal(1);
+ uint64_t SrlAmt = Node->getConstantOperandVal(1);
+ if (ShlAmt == 32 && SrlAmt > 32) {
+ SDValue SrlAmtSub32Val =
+ CurDAG->getTargetConstant(SrlAmt - 32, SDLoc(Node), XLenVT);
+ CurDAG->SelectNodeTo(Node, RISCV::SRLIW, XLenVT, Op0->getOperand(0),
+ SrlAmtSub32Val);
+ return;
+ }
+ }
break;
}
case RISCVISD::READ_CYCLE_WIDE:
@@ -459,55 +471,6 @@ bool RISCVDAGToDAGISel::SelectRORIW(SDValue N, SDValue &RS1, SDValue &Shamt) {
return false;
}
-// Check that it is a FSRIW (i32 Funnel Shift Right Immediate on RV64).
-// We first check that it is the right node tree:
-//
-// (SIGN_EXTEND_INREG (OR (SHL (AsserSext RS1, i32), VC2),
-// (SRL (AND (AssertSext RS2, i32), VC3), VC1)))
-//
-// Then we check that the constant operands respect these constraints:
-//
-// VC2 == 32 - VC1
-// VC3 == maskLeadingOnes<uint32_t>(VC2)
-//
-// being VC1 the Shamt we need, VC2 the complementary of Shamt over 32
-// and VC3 a 32 bit mask of (32 - VC1) leading ones.
-
-bool RISCVDAGToDAGISel::SelectFSRIW(SDValue N, SDValue &RS1, SDValue &RS2,
- SDValue &Shamt) {
- if (N.getOpcode() == ISD::SIGN_EXTEND_INREG &&
- Subtarget->getXLenVT() == MVT::i64 &&
- cast<VTSDNode>(N.getOperand(1))->getVT() == MVT::i32) {
- if (N.getOperand(0).getOpcode() == ISD::OR) {
- SDValue Or = N.getOperand(0);
- if (Or.getOperand(0).getOpcode() == ISD::SHL &&
- Or.getOperand(1).getOpcode() == ISD::SRL) {
- SDValue Shl = Or.getOperand(0);
- SDValue Srl = Or.getOperand(1);
- if (Srl.getOperand(0).getOpcode() == ISD::AND) {
- SDValue And = Srl.getOperand(0);
- if (isa<ConstantSDNode>(Srl.getOperand(1)) &&
- isa<ConstantSDNode>(Shl.getOperand(1)) &&
- isa<ConstantSDNode>(And.getOperand(1))) {
- uint32_t VC1 = Srl.getConstantOperandVal(1);
- uint32_t VC2 = Shl.getConstantOperandVal(1);
- uint32_t VC3 = And.getConstantOperandVal(1);
- if (VC2 == (32 - VC1) &&
- VC3 == maskLeadingOnes<uint32_t>(VC2)) {
- RS1 = Shl.getOperand(0);
- RS2 = And.getOperand(0);
- Shamt = CurDAG->getTargetConstant(VC1, SDLoc(N),
- Srl.getOperand(1).getValueType());
- return true;
- }
- }
- }
- }
- }
- }
- return false;
-}
-
// Merge an ADDI into the offset of a load/store instruction where possible.
// (load (addi base, off1), off2) -> (load base, off1+off2)
// (store val, (addi base, off1), off2) -> (store val, base, off1+off2)
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
index 0ca12510a230..bc1655b673d7 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -52,7 +52,6 @@ class RISCVDAGToDAGISel : public SelectionDAGISel {
bool SelectSLOIW(SDValue N, SDValue &RS1, SDValue &Shamt);
bool SelectSROIW(SDValue N, SDValue &RS1, SDValue &Shamt);
bool SelectRORIW(SDValue N, SDValue &RS1, SDValue &Shamt);
- bool SelectFSRIW(SDValue N, SDValue &RS1, SDValue &RS2, SDValue &Shamt);
// Include the pieces autogenerated from the target description.
#include "RISCVGenDAGISel.inc"
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoB.td
index afac509f743d..c85ab7996fcc 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoB.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoB.td
@@ -14,7 +14,7 @@
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// Operand definitions.
+// Operand and SDNode transformation definitions.
//===----------------------------------------------------------------------===//
def UImmLog2XLenHalfAsmOperand : AsmOperandClass {
@@ -40,6 +40,12 @@ def shfl_uimm : Operand<XLenVT>, ImmLeaf<XLenVT, [{
}];
}
+// Return an immediate value minus 32.
+def ImmSub32 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue() - 32, SDLoc(N),
+ N->getValueType(0));
+}]>;
+
//===----------------------------------------------------------------------===//
// Instruction class templates
//===----------------------------------------------------------------------===//
@@ -643,7 +649,6 @@ def SLLIUWPat : ComplexPattern<i64, 2, "SelectSLLIUW", [and]>;
def SLOIWPat : ComplexPattern<i64, 2, "SelectSLOIW", [sext_inreg]>;
def SROIWPat : ComplexPattern<i64, 2, "SelectSROIW", [or]>;
def RORIWPat : ComplexPattern<i64, 2, "SelectRORIW", [sext_inreg]>;
-def FSRIWPat : ComplexPattern<i64, 3, "SelectFSRIW", [sext_inreg]>;
let Predicates = [HasStdExtZbbOrZbp] in {
def : Pat<(and GPR:$rs1, (not GPR:$rs2)), (ANDN GPR:$rs1, GPR:$rs2)>;
@@ -1019,28 +1024,21 @@ def : Pat<(sra (bitreverse GPR:$rs1), (i64 32)), (GREVIW GPR:$rs1, (i64 31))>;
} // Predicates = [HasStdExtZbp, IsRV64]
let Predicates = [HasStdExtZbt, IsRV64] in {
-def : Pat<(riscv_selectcc (and (assertsexti32 GPR:$rs3), 31),
- (i64 0),
- (i64 17),
- (assertsexti32 GPR:$rs1),
- (or (riscv_sllw (assertsexti32 GPR:$rs1),
- (and (assertsexti32 GPR:$rs3), 31)),
- (riscv_srlw (assertsexti32 GPR:$rs2),
- (sub (i64 32),
- (assertsexti32 GPR:$rs3))))),
+def : Pat<(sext_inreg (fshl (assertsexti32 GPR:$rs1),
+ (shl (assertsexti32 GPR:$rs2), (i64 32)),
+ (and (assertsexti32 GPR:$rs3), (i64 31))),
+ i32),
(FSLW GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
-def : Pat<(riscv_selectcc (and (assertsexti32 GPR:$rs3), 31),
- (i64 0),
- (i64 17),
- (assertsexti32 GPR:$rs2),
- (or (riscv_sllw (assertsexti32 GPR:$rs1),
- (sub (i64 32),
- (assertsexti32 GPR:$rs3))),
- (riscv_srlw (assertsexti32 GPR:$rs2),
- (and (assertsexti32 GPR:$rs3), 31)))),
+def : Pat<(sext_inreg (fshr (assertsexti32 GPR:$rs1),
+ (shl (assertsexti32 GPR:$rs2), (i64 32)),
+ (or (assertsexti32 GPR:$rs3), (i64 32))),
+ i32),
(FSRW GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
-def : Pat<(FSRIWPat GPR:$rs1, GPR:$rs2, uimmlog2xlen:$shamt),
- (FSRIW GPR:$rs1, GPR:$rs2, uimmlog2xlen:$shamt)>;
+def : Pat<(sext_inreg (fshr (assertsexti32 GPR:$rs1),
+ (shl (assertsexti32 GPR:$rs2), (i64 32)),
+ uimmlog2xlen:$shamt),
+ i32),
+ (FSRIW GPR:$rs1, GPR:$rs2, (ImmSub32 uimm5:$shamt))>;
} // Predicates = [HasStdExtZbt, IsRV64]
let Predicates = [HasStdExtZbb, IsRV64] in {
diff --git a/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll b/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll
index 6777fecbb5d5..7c06dadf5f4e 100644
--- a/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll
+++ b/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll
@@ -110,8 +110,8 @@ define <4 x i32> @rotl_v4i32_rotl_const_shift(<4 x i32> %x) {
define i8 @rotr_i8_const_shift(i8 %x) {
; CHECK-LABEL: rotr_i8_const_shift:
; CHECK: // %bb.0:
-; CHECK-NEXT: ubfx w8, w0, #3, #5
-; CHECK-NEXT: bfi w8, w0, #5, #27
+; CHECK-NEXT: lsl w8, w0, #5
+; CHECK-NEXT: bfxil w8, w0, #3, #5
; CHECK-NEXT: mov w0, w8
; CHECK-NEXT: ret
%f = call i8 @llvm.fshr.i8(i8 %x, i8 %x, i8 3)
@@ -138,7 +138,7 @@ define i16 @rotr_i16(i16 %x, i16 %z) {
; CHECK-NEXT: lsr w8, w8, w9
; CHECK-NEXT: and w9, w10, #0xf
; CHECK-NEXT: lsl w9, w0, w9
-; CHECK-NEXT: orr w0, w9, w8
+; CHECK-NEXT: orr w0, w8, w9
; CHECK-NEXT: ret
%f = call i16 @llvm.fshr.i16(i16 %x, i16 %x, i16 %z)
ret i16 %f
@@ -167,14 +167,14 @@ define i64 @rotr_i64(i64 %x, i64 %z) {
define <4 x i32> @rotr_v4i32(<4 x i32> %x, <4 x i32> %z) {
; CHECK-LABEL: rotr_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v2.4s, #31
-; CHECK-NEXT: neg v3.4s, v1.4s
-; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: and v2.16b, v3.16b, v2.16b
+; CHECK-NEXT: movi v3.4s, #31
+; CHECK-NEXT: neg v2.4s, v1.4s
+; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-NEXT: and v2.16b, v2.16b, v3.16b
; CHECK-NEXT: neg v1.4s, v1.4s
-; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
%f = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %z)
ret <4 x i32> %f
@@ -185,8 +185,8 @@ define <4 x i32> @rotr_v4i32(<4 x i32> %x, <4 x i32> %z) {
define <4 x i32> @rotr_v4i32_const_shift(<4 x i32> %x) {
; CHECK-LABEL: rotr_v4i32_const_shift:
; CHECK: // %bb.0:
-; CHECK-NEXT: ushr v1.4s, v0.4s, #3
-; CHECK-NEXT: shl v0.4s, v0.4s, #29
+; CHECK-NEXT: shl v1.4s, v0.4s, #29
+; CHECK-NEXT: ushr v0.4s, v0.4s, #3
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%f = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)
diff --git a/llvm/test/CodeGen/AArch64/funnel-shift.ll b/llvm/test/CodeGen/AArch64/funnel-shift.ll
index 98815fe69559..011cbf476c38 100644
--- a/llvm/test/CodeGen/AArch64/funnel-shift.ll
+++ b/llvm/test/CodeGen/AArch64/funnel-shift.ll
@@ -18,12 +18,12 @@ declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) {
; CHECK-LABEL: fshl_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ands w9, w2, #0x1f
-; CHECK-NEXT: neg w9, w9
+; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2
+; CHECK-NEXT: mvn w9, w2
+; CHECK-NEXT: lsr w10, w1, #1
; CHECK-NEXT: lsl w8, w0, w2
-; CHECK-NEXT: lsr w9, w1, w9
-; CHECK-NEXT: orr w8, w8, w9
-; CHECK-NEXT: csel w0, w0, w8, eq
+; CHECK-NEXT: lsr w9, w10, w9
+; CHECK-NEXT: orr w0, w8, w9
; CHECK-NEXT: ret
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
ret i32 %f
@@ -34,22 +34,19 @@ declare i37 @llvm.fshl.i37(i37, i37, i37)
define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) {
; CHECK-LABEL: fshl_i37:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x10, #31883
-; CHECK-NEXT: movk x10, #3542, lsl #16
-; CHECK-NEXT: movk x10, #51366, lsl #32
-; CHECK-NEXT: and x9, x2, #0x1fffffffff
-; CHECK-NEXT: movk x10, #56679, lsl #48
-; CHECK-NEXT: umulh x10, x9, x10
-; CHECK-NEXT: mov w11, #37
-; CHECK-NEXT: lsr x10, x10, #5
-; CHECK-NEXT: msub x9, x10, x11, x9
-; CHECK-NEXT: and x8, x1, #0x1fffffffff
-; CHECK-NEXT: sub x11, x11, x9
-; CHECK-NEXT: lsl x10, x0, x9
-; CHECK-NEXT: lsr x8, x8, x11
-; CHECK-NEXT: orr x8, x10, x8
-; CHECK-NEXT: cmp x9, #0 // =0
-; CHECK-NEXT: csel x0, x0, x8, eq
+; CHECK-NEXT: mov x8, #31883
+; CHECK-NEXT: movk x8, #3542, lsl #16
+; CHECK-NEXT: movk x8, #51366, lsl #32
+; CHECK-NEXT: movk x8, #56679, lsl #48
+; CHECK-NEXT: umulh x8, x2, x8
+; CHECK-NEXT: mov w9, #37
+; CHECK-NEXT: ubfx x8, x8, #5, #27
+; CHECK-NEXT: msub w8, w8, w9, w2
+; CHECK-NEXT: lsl x9, x0, x8
+; CHECK-NEXT: mvn w8, w8
+; CHECK-NEXT: ubfiz x10, x1, #26, #37
+; CHECK-NEXT: lsr x8, x10, x8
+; CHECK-NEXT: orr x0, x9, x8
; CHECK-NEXT: ret
%f = call i37 @llvm.fshl.i37(i37 %x, i37 %y, i37 %z)
ret i37 %f
@@ -145,12 +142,12 @@ define i8 @fshl_i8_const_fold() {
define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) {
; CHECK-LABEL: fshr_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ands w9, w2, #0x1f
-; CHECK-NEXT: neg w9, w9
+; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2
+; CHECK-NEXT: mvn w9, w2
+; CHECK-NEXT: lsl w10, w0, #1
; CHECK-NEXT: lsr w8, w1, w2
-; CHECK-NEXT: lsl w9, w0, w9
-; CHECK-NEXT: orr w8, w9, w8
-; CHECK-NEXT: csel w0, w1, w8, eq
+; CHECK-NEXT: lsl w9, w10, w9
+; CHECK-NEXT: orr w0, w9, w8
; CHECK-NEXT: ret
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
ret i32 %f
@@ -161,22 +158,21 @@ declare i37 @llvm.fshr.i37(i37, i37, i37)
define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) {
; CHECK-LABEL: fshr_i37:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x10, #31883
-; CHECK-NEXT: movk x10, #3542, lsl #16
-; CHECK-NEXT: movk x10, #51366, lsl #32
-; CHECK-NEXT: and x9, x2, #0x1fffffffff
-; CHECK-NEXT: movk x10, #56679, lsl #48
-; CHECK-NEXT: umulh x10, x9, x10
-; CHECK-NEXT: mov w11, #37
-; CHECK-NEXT: lsr x10, x10, #5
-; CHECK-NEXT: msub x9, x10, x11, x9
-; CHECK-NEXT: and x8, x1, #0x1fffffffff
-; CHECK-NEXT: sub x10, x11, x9
-; CHECK-NEXT: lsr x8, x8, x9
-; CHECK-NEXT: lsl x10, x0, x10
-; CHECK-NEXT: orr x8, x10, x8
-; CHECK-NEXT: cmp x9, #0 // =0
-; CHECK-NEXT: csel x0, x1, x8, eq
+; CHECK-NEXT: mov x8, #31883
+; CHECK-NEXT: movk x8, #3542, lsl #16
+; CHECK-NEXT: movk x8, #51366, lsl #32
+; CHECK-NEXT: movk x8, #56679, lsl #48
+; CHECK-NEXT: umulh x8, x2, x8
+; CHECK-NEXT: mov w9, #37
+; CHECK-NEXT: lsr x8, x8, #5
+; CHECK-NEXT: msub w8, w8, w9, w2
+; CHECK-NEXT: lsl x10, x1, #27
+; CHECK-NEXT: add w8, w8, #27 // =27
+; CHECK-NEXT: lsr x9, x10, x8
+; CHECK-NEXT: mvn w8, w8
+; CHECK-NEXT: lsl x10, x0, #1
+; CHECK-NEXT: lsl x8, x10, x8
+; CHECK-NEXT: orr x0, x8, x9
; CHECK-NEXT: ret
%f = call i37 @llvm.fshr.i37(i37 %x, i37 %y, i37 %z)
ret i37 %f
diff --git a/llvm/test/CodeGen/AArch64/shift-by-signext.ll b/llvm/test/CodeGen/AArch64/shift-by-signext.ll
index 2c2abe8e7bc7..691cdc4fa199 100644
--- a/llvm/test/CodeGen/AArch64/shift-by-signext.ll
+++ b/llvm/test/CodeGen/AArch64/shift-by-signext.ll
@@ -80,12 +80,12 @@ declare i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %c)
define i32 @n6_fshl(i32 %x, i32 %y, i8 %shamt) nounwind {
; CHECK-LABEL: n6_fshl:
; CHECK: // %bb.0:
-; CHECK-NEXT: ands w9, w2, #0x1f
-; CHECK-NEXT: neg w9, w9
+; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2
+; CHECK-NEXT: mvn w9, w2
+; CHECK-NEXT: lsr w10, w1, #1
; CHECK-NEXT: lsl w8, w0, w2
-; CHECK-NEXT: lsr w9, w1, w9
-; CHECK-NEXT: orr w8, w8, w9
-; CHECK-NEXT: csel w0, w0, w8, eq
+; CHECK-NEXT: lsr w9, w10, w9
+; CHECK-NEXT: orr w0, w8, w9
; CHECK-NEXT: ret
%shamt_wide = sext i8 %shamt to i32
%r = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %shamt_wide)
@@ -94,12 +94,12 @@ define i32 @n6_fshl(i32 %x, i32 %y, i8 %shamt) nounwind {
define i32 @n7_fshr(i32 %x, i32 %y, i8 %shamt) nounwind {
; CHECK-LABEL: n7_fshr:
; CHECK: // %bb.0:
-; CHECK-NEXT: ands w9, w2, #0x1f
-; CHECK-NEXT: neg w9, w9
+; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2
+; CHECK-NEXT: mvn w9, w2
+; CHECK-NEXT: lsl w10, w0, #1
; CHECK-NEXT: lsr w8, w1, w2
-; CHECK-NEXT: lsl w9, w0, w9
-; CHECK-NEXT: orr w8, w9, w8
-; CHECK-NEXT: csel w0, w1, w8, eq
+; CHECK-NEXT: lsl w9, w10, w9
+; CHECK-NEXT: orr w0, w9, w8
; CHECK-NEXT: ret
%shamt_wide = sext i8 %shamt to i32
%r = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %shamt_wide)
diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
index 2ecce1807921..a20c047d556d 100644
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -16,14 +16,10 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_sub_i32 s3, 32, s2
+; SI-NEXT: s_sub_i32 s2, 0, s2
; SI-NEXT: v_mov_b32_e32 v0, s1
-; SI-NEXT: v_mov_b32_e32 v1, s3
-; SI-NEXT: s_and_b32 s1, s2, 31
+; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: v_alignbit_b32 v0, s0, v0, v1
-; SI-NEXT: v_mov_b32_e32 v1, s0
-; SI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -32,15 +28,10 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_sub_i32 s3, 32, s2
+; VI-NEXT: s_sub_i32 s2, 0, s2
; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: s_and_b32 s1, s2, 31
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_cmp_eq_u32 s1, 0
-; VI-NEXT: v_alignbit_b32 v0, s0, v0, v1
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_alignbit_b32 v2, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -51,15 +42,10 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_i32 s3, 32, s2
+; GFX9-NEXT: s_sub_i32 s2, 0, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: s_and_b32 s1, s2, 31
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: s_cmp_eq_u32 s1, 0
-; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: v_alignbit_b32 v2, s0, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: global_store_dword v[0:1], v2, off
@@ -67,17 +53,13 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
;
; R600-LABEL: fshl_i32:
; R600: ; %bb.0: ; %entry
-; R600-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
-; R600-NEXT: SUB_INT * T0.W, literal.x, KC0[3].X,
-; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; R600-NEXT: BIT_ALIGN_INT T0.W, KC0[2].Z, KC0[2].W, PV.W,
-; R600-NEXT: AND_INT * T1.W, KC0[3].X, literal.x,
-; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T0.X, PS, KC0[2].Z, PV.W,
+; R600-NEXT: SUB_INT * T0.W, 0.0, KC0[3].X,
+; R600-NEXT: BIT_ALIGN_INT T0.X, KC0[2].Z, KC0[2].W, PV.W,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
@@ -151,21 +133,13 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s9
-; SI-NEXT: s_sub_i32 s10, 32, s1
-; SI-NEXT: v_mov_b32_e32 v1, s10
-; SI-NEXT: s_and_b32 s1, s1, 31
-; SI-NEXT: v_alignbit_b32 v0, s3, v0, v1
-; SI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
-; SI-NEXT: v_mov_b32_e32 v1, s3
-; SI-NEXT: s_sub_i32 s1, 32, s0
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; SI-NEXT: s_and_b32 s0, s0, 31
+; SI-NEXT: s_sub_i32 s1, 0, s1
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: s_sub_i32 s0, 0, s0
+; SI-NEXT: v_alignbit_b32 v1, s3, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s8
-; SI-NEXT: v_mov_b32_e32 v2, s1
+; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: v_alignbit_b32 v0, s2, v0, v2
-; SI-NEXT: v_mov_b32_e32 v2, s2
-; SI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -177,23 +151,13 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: s_sub_i32 s8, 32, s1
-; VI-NEXT: s_and_b32 s1, s1, 31
-; VI-NEXT: v_mov_b32_e32 v1, s8
-; VI-NEXT: s_cmp_eq_u32 s1, 0
-; VI-NEXT: v_alignbit_b32 v0, s5, v0, v1
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_sub_i32 s1, 32, s0
-; VI-NEXT: s_and_b32 s0, s0, 31
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; VI-NEXT: s_cmp_eq_u32 s0, 0
+; VI-NEXT: s_sub_i32 s1, 0, s1
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_sub_i32 s0, 0, s0
+; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_alignbit_b32 v0, s4, v0, v2
-; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -207,23 +171,13 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: s_sub_i32 s8, 32, s1
-; GFX9-NEXT: s_and_b32 s1, s1, 31
-; GFX9-NEXT: v_mov_b32_e32 v1, s8
-; GFX9-NEXT: s_cmp_eq_u32 s1, 0
-; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, v1
-; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_sub_i32 s1, 32, s0
-; GFX9-NEXT: s_and_b32 s0, s0, 31
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; GFX9-NEXT: s_cmp_eq_u32 s0, 0
+; GFX9-NEXT: s_sub_i32 s1, 0, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_sub_i32 s0, 0, s0
+; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v2
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
@@ -231,24 +185,16 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
;
; R600-LABEL: fshl_v2i32:
; R600: ; %bb.0: ; %entry
-; R600-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[]
-; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
+; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
-; R600-NEXT: AND_INT T0.W, KC0[4].X, literal.x,
-; R600-NEXT: SUB_INT * T1.W, literal.y, KC0[4].X,
-; R600-NEXT: 31(4.344025e-44), 32(4.484155e-44)
-; R600-NEXT: AND_INT T0.Y, KC0[3].W, literal.x,
-; R600-NEXT: SUB_INT T0.Z, literal.y, KC0[3].W,
-; R600-NEXT: BIT_ALIGN_INT T1.W, KC0[3].X, KC0[3].Z, PS,
-; R600-NEXT: SETE_INT * T0.W, PV.W, 0.0,
-; R600-NEXT: 31(4.344025e-44), 32(4.484155e-44)
-; R600-NEXT: CNDE_INT T1.Y, PS, PV.W, KC0[3].X,
-; R600-NEXT: BIT_ALIGN_INT T0.W, KC0[2].W, KC0[3].Y, PV.Z,
-; R600-NEXT: SETE_INT * T1.W, PV.Y, 0.0,
-; R600-NEXT: CNDE_INT T1.X, PS, PV.W, KC0[2].W,
-; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
+; R600-NEXT: SUB_INT * T0.W, 0.0, KC0[4].X,
+; R600-NEXT: BIT_ALIGN_INT T0.Y, KC0[3].X, KC0[3].Z, PV.W,
+; R600-NEXT: SUB_INT * T0.W, 0.0, KC0[3].W,
+; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, PV.W,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
@@ -332,37 +278,21 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s15
-; SI-NEXT: s_sub_i32 s16, 32, s3
-; SI-NEXT: v_mov_b32_e32 v1, s16
-; SI-NEXT: s_and_b32 s3, s3, 31
-; SI-NEXT: v_alignbit_b32 v0, s11, v0, v1
-; SI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
-; SI-NEXT: v_mov_b32_e32 v1, s11
-; SI-NEXT: s_sub_i32 s3, 32, s2
-; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
-; SI-NEXT: s_and_b32 s2, s2, 31
-; SI-NEXT: v_mov_b32_e32 v0, s14
+; SI-NEXT: s_sub_i32 s3, 0, s3
; SI-NEXT: v_mov_b32_e32 v1, s3
-; SI-NEXT: v_alignbit_b32 v0, s10, v0, v1
-; SI-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
-; SI-NEXT: v_mov_b32_e32 v1, s10
-; SI-NEXT: s_sub_i32 s2, 32, s1
-; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
-; SI-NEXT: s_and_b32 s1, s1, 31
-; SI-NEXT: v_mov_b32_e32 v0, s13
+; SI-NEXT: s_sub_i32 s2, 0, s2
+; SI-NEXT: v_alignbit_b32 v3, s11, v0, v1
+; SI-NEXT: v_mov_b32_e32 v0, s14
; SI-NEXT: v_mov_b32_e32 v1, s2
-; SI-NEXT: v_alignbit_b32 v0, s9, v0, v1
-; SI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
-; SI-NEXT: v_mov_b32_e32 v1, s9
-; SI-NEXT: s_sub_i32 s1, 32, s0
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; SI-NEXT: s_and_b32 s0, s0, 31
+; SI-NEXT: s_sub_i32 s1, 0, s1
+; SI-NEXT: v_alignbit_b32 v2, s10, v0, v1
+; SI-NEXT: s_sub_i32 s0, 0, s0
+; SI-NEXT: v_mov_b32_e32 v0, s13
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_alignbit_b32 v1, s9, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s12
-; SI-NEXT: v_mov_b32_e32 v4, s1
+; SI-NEXT: v_mov_b32_e32 v4, s0
; SI-NEXT: v_alignbit_b32 v0, s8, v0, v4
-; SI-NEXT: v_mov_b32_e32 v4, s8
-; SI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -374,41 +304,21 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
-; VI-NEXT: s_sub_i32 s14, 32, s3
-; VI-NEXT: s_and_b32 s3, s3, 31
-; VI-NEXT: v_mov_b32_e32 v1, s14
-; VI-NEXT: s_cmp_eq_u32 s3, 0
-; VI-NEXT: v_alignbit_b32 v0, s7, v0, v1
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: s_sub_i32 s3, 32, s2
-; VI-NEXT: s_and_b32 s2, s2, 31
-; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
-; VI-NEXT: s_cmp_eq_u32 s2, 0
-; VI-NEXT: v_mov_b32_e32 v0, s10
+; VI-NEXT: s_sub_i32 s3, 0, s3
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_alignbit_b32 v0, s6, v0, v1
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: v_mov_b32_e32 v1, s6
-; VI-NEXT: s_sub_i32 s2, 32, s1
-; VI-NEXT: s_and_b32 s1, s1, 31
-; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
-; VI-NEXT: s_cmp_eq_u32 s1, 0
-; VI-NEXT: v_mov_b32_e32 v0, s9
+; VI-NEXT: s_sub_i32 s2, 0, s2
+; VI-NEXT: v_alignbit_b32 v3, s7, v0, v1
+; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_alignbit_b32 v0, s5, v0, v1
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_sub_i32 s1, 32, s0
-; VI-NEXT: s_and_b32 s0, s0, 31
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; VI-NEXT: s_cmp_eq_u32 s0, 0
+; VI-NEXT: s_sub_i32 s1, 0, s1
+; VI-NEXT: v_alignbit_b32 v2, s6, v0, v1
+; VI-NEXT: s_sub_i32 s0, 0, s0
+; VI-NEXT: v_mov_b32_e32 v0, s9
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_alignbit_b32 v0, s4, v0, v4
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; VI-NEXT: v_mov_b32_e32 v4, s12
; VI-NEXT: v_mov_b32_e32 v5, s13
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
@@ -422,41 +332,21 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s11
-; GFX9-NEXT: s_sub_i32 s14, 32, s3
-; GFX9-NEXT: s_and_b32 s3, s3, 31
-; GFX9-NEXT: v_mov_b32_e32 v1, s14
-; GFX9-NEXT: s_cmp_eq_u32 s3, 0
-; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, v1
-; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_sub_i32 s3, 32, s2
-; GFX9-NEXT: s_and_b32 s2, s2, 31
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
-; GFX9-NEXT: s_cmp_eq_u32 s2, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s10
+; GFX9-NEXT: s_sub_i32 s3, 0, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_alignbit_b32 v0, s6, v0, v1
-; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: s_sub_i32 s2, 32, s1
-; GFX9-NEXT: s_and_b32 s1, s1, 31
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
-; GFX9-NEXT: s_cmp_eq_u32 s1, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s9
+; GFX9-NEXT: s_sub_i32 s2, 0, s2
+; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v0, s10
; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, v1
-; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_sub_i32 s1, 32, s0
-; GFX9-NEXT: s_and_b32 s0, s0, 31
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; GFX9-NEXT: s_cmp_eq_u32 s0, 0
+; GFX9-NEXT: s_sub_i32 s1, 0, s1
+; GFX9-NEXT: v_alignbit_b32 v2, s6, v0, v1
+; GFX9-NEXT: s_sub_i32 s0, 0, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s8
-; GFX9-NEXT: v_mov_b32_e32 v4, s1
+; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v4
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX9-NEXT: v_mov_b32_e32 v4, s12
; GFX9-NEXT: v_mov_b32_e32 v5, s13
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
@@ -464,35 +354,19 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
;
; R600-LABEL: fshl_v4i32:
; R600: ; %bb.0: ; %entry
-; R600-NEXT: ALU 25, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
-; R600-NEXT: SUB_INT * T0.W, literal.x, KC0[6].X,
-; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; R600-NEXT: SUB_INT * T0.W, 0.0, KC0[6].X,
; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, PV.W,
-; R600-NEXT: AND_INT * T1.W, KC0[6].X, literal.x,
-; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; R600-NEXT: AND_INT T0.X, KC0[5].Z, literal.x,
-; R600-NEXT: SUB_INT T0.Y, literal.y, KC0[5].Z,
-; R600-NEXT: SETE_INT T0.Z, PV.W, 0.0,
-; R600-NEXT: SUB_INT T1.W, literal.y, KC0[5].W,
-; R600-NEXT: AND_INT * T2.W, KC0[5].W, literal.x,
-; R600-NEXT: 31(4.344025e-44), 32(4.484155e-44)
-; R600-NEXT: SETE_INT T1.Z, PS, 0.0,
-; R600-NEXT: BIT_ALIGN_INT * T1.W, KC0[3].W, KC0[4].W, PV.W,
-; R600-NEXT: CNDE_INT * T0.W, T0.Z, T0.W, KC0[4].X,
-; R600-NEXT: CNDE_INT T0.Z, T1.Z, T1.W, KC0[3].W,
-; R600-NEXT: BIT_ALIGN_INT T1.W, KC0[3].Z, KC0[4].Z, T0.Y,
-; R600-NEXT: SETE_INT * T2.W, T0.X, 0.0,
-; R600-NEXT: CNDE_INT T0.Y, PS, PV.W, KC0[3].Z,
-; R600-NEXT: AND_INT T1.W, KC0[5].Y, literal.x,
-; R600-NEXT: SUB_INT * T2.W, literal.y, KC0[5].Y,
-; R600-NEXT: 31(4.344025e-44), 32(4.484155e-44)
-; R600-NEXT: BIT_ALIGN_INT T2.W, KC0[3].Y, KC0[4].Y, PS,
-; R600-NEXT: SETE_INT * T1.W, PV.W, 0.0,
-; R600-NEXT: CNDE_INT T0.X, PS, PV.W, KC0[3].Y,
+; R600-NEXT: SUB_INT * T1.W, 0.0, KC0[5].W,
+; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, PV.W,
+; R600-NEXT: SUB_INT * T1.W, 0.0, KC0[5].Z,
+; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, PV.W,
+; R600-NEXT: SUB_INT * T1.W, 0.0, KC0[5].Y,
+; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, PV.W,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index 768d25ee06ff..157330b8bd47 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -138,17 +138,11 @@ define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s9
-; SI-NEXT: s_and_b32 s1, s1, 31
; SI-NEXT: v_mov_b32_e32 v1, s1
-; SI-NEXT: s_and_b32 s0, s0, 31
; SI-NEXT: v_alignbit_b32 v1, s3, v0, v1
-; SI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
-; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v2, s0
-; SI-NEXT: v_alignbit_b32 v2, s2, v0, v2
-; SI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
-; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-NEXT: v_alignbit_b32 v0, s2, v0, v2
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -160,19 +154,11 @@ define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: s_and_b32 s1, s1, 31
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_cmp_eq_u32 s1, 0
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_and_b32 s0, s0, 31
; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
-; VI-NEXT: s_cmp_eq_u32 s0, 0
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_alignbit_b32 v2, s4, v0, v2
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-NEXT: v_alignbit_b32 v0, s4, v0, v2
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -186,19 +172,11 @@ define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: s_and_b32 s1, s1, 31
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: s_cmp_eq_u32 s1, 0
-; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT: s_and_b32 s0, s0, 31
; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
-; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_alignbit_b32 v2, s4, v0, v2
-; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v2
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
@@ -206,21 +184,15 @@ define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
;
; R600-LABEL: fshr_v2i32:
; R600: ; %bb.0: ; %entry
-; R600-NEXT: ALU 11, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
-; R600-NEXT: AND_INT * T0.W, KC0[4].X, literal.x,
-; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; R600-NEXT: BIT_ALIGN_INT T1.W, KC0[3].X, KC0[3].Z, PV.W,
-; R600-NEXT: SETE_INT * T0.W, PV.W, 0.0,
-; R600-NEXT: CNDE_INT T0.Y, PS, PV.W, KC0[3].Z,
-; R600-NEXT: AND_INT * T0.W, KC0[3].W, literal.x,
-; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; R600-NEXT: BIT_ALIGN_INT T1.W, KC0[2].W, KC0[3].Y, PV.W,
-; R600-NEXT: SETE_INT * T0.W, PV.W, 0.0,
-; R600-NEXT: CNDE_INT T0.X, PS, PV.W, KC0[3].Y,
+; R600-NEXT: MOV * T0.W, KC0[4].X,
+; R600-NEXT: BIT_ALIGN_INT T0.Y, KC0[3].X, KC0[3].Z, PV.W,
+; R600-NEXT: MOV * T0.W, KC0[3].W,
+; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, PV.W,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
@@ -305,29 +277,17 @@ define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s15
-; SI-NEXT: s_and_b32 s3, s3, 31
; SI-NEXT: v_mov_b32_e32 v1, s3
-; SI-NEXT: v_alignbit_b32 v1, s11, v0, v1
-; SI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
-; SI-NEXT: s_and_b32 s2, s2, 31
-; SI-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
+; SI-NEXT: v_alignbit_b32 v3, s11, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s14
; SI-NEXT: v_mov_b32_e32 v1, s2
-; SI-NEXT: v_alignbit_b32 v1, s10, v0, v1
-; SI-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
-; SI-NEXT: s_and_b32 s1, s1, 31
-; SI-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; SI-NEXT: v_alignbit_b32 v2, s10, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s13
; SI-NEXT: v_mov_b32_e32 v1, s1
-; SI-NEXT: s_and_b32 s0, s0, 31
; SI-NEXT: v_alignbit_b32 v1, s9, v0, v1
-; SI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
-; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v4, s0
-; SI-NEXT: v_alignbit_b32 v4, s8, v0, v4
-; SI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
-; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; SI-NEXT: v_alignbit_b32 v0, s8, v0, v4
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -339,33 +299,17 @@ define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
-; VI-NEXT: s_and_b32 s3, s3, 31
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_cmp_eq_u32 s3, 0
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_and_b32 s2, s2, 31
-; VI-NEXT: v_alignbit_b32 v1, s7, v0, v1
-; VI-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
-; VI-NEXT: s_cmp_eq_u32 s2, 0
+; VI-NEXT: v_alignbit_b32 v3, s7, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_and_b32 s1, s1, 31
-; VI-NEXT: v_alignbit_b32 v1, s6, v0, v1
-; VI-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
-; VI-NEXT: s_cmp_eq_u32 s1, 0
+; VI-NEXT: v_alignbit_b32 v2, s6, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s9
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_and_b32 s0, s0, 31
; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
-; VI-NEXT: s_cmp_eq_u32 s0, 0
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_alignbit_b32 v4, s4, v0, v4
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; VI-NEXT: v_alignbit_b32 v0, s4, v0, v4
; VI-NEXT: v_mov_b32_e32 v4, s12
; VI-NEXT: v_mov_b32_e32 v5, s13
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
@@ -379,33 +323,17 @@ define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s11
-; GFX9-NEXT: s_and_b32 s3, s3, 31
; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: s_cmp_eq_u32 s3, 0
-; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT: s_and_b32 s2, s2, 31
-; GFX9-NEXT: v_alignbit_b32 v1, s7, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX9-NEXT: s_cmp_eq_u32 s2, 0
+; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s10
; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT: s_and_b32 s1, s1, 31
-; GFX9-NEXT: v_alignbit_b32 v1, s6, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX9-NEXT: s_cmp_eq_u32 s1, 0
+; GFX9-NEXT: v_alignbit_b32 v2, s6, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s9
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT: s_and_b32 s0, s0, 31
; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
-; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_alignbit_b32 v4, s4, v0, v4
-; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v4
; GFX9-NEXT: v_mov_b32_e32 v4, s12
; GFX9-NEXT: v_mov_b32_e32 v5, s13
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
@@ -413,31 +341,20 @@ define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
;
; R600-LABEL: fshr_v4i32:
; R600: ; %bb.0: ; %entry
-; R600-NEXT: ALU 20, @4, KC0[CB0:0-32], KC1[]
-; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
+; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
-; R600-NEXT: AND_INT T0.W, KC0[5].Z, literal.x,
-; R600-NEXT: AND_INT * T1.W, KC0[6].X, literal.x,
-; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; R600-NEXT: SETE_INT T0.Z, PS, 0.0,
-; R600-NEXT: BIT_ALIGN_INT * T1.W, KC0[4].X, KC0[5].X, PS,
-; R600-NEXT: AND_INT * T2.W, KC0[5].W, literal.x,
-; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; R600-NEXT: SETE_INT T1.Z, PV.W, 0.0,
-; R600-NEXT: BIT_ALIGN_INT * T2.W, KC0[3].W, KC0[4].W, PV.W,
-; R600-NEXT: CNDE_INT * T1.W, T0.Z, T1.W, KC0[5].X,
-; R600-NEXT: CNDE_INT T1.Z, T1.Z, T2.W, KC0[4].W,
-; R600-NEXT: BIT_ALIGN_INT T2.W, KC0[3].Z, KC0[4].Z, T0.W,
-; R600-NEXT: SETE_INT * T0.W, T0.W, 0.0,
-; R600-NEXT: CNDE_INT T1.Y, PS, PV.W, KC0[4].Z,
-; R600-NEXT: AND_INT * T0.W, KC0[5].Y, literal.x,
-; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; R600-NEXT: BIT_ALIGN_INT T2.W, KC0[3].Y, KC0[4].Y, PV.W,
-; R600-NEXT: SETE_INT * T0.W, PV.W, 0.0,
-; R600-NEXT: CNDE_INT T1.X, PS, PV.W, KC0[4].Y,
-; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
+; R600-NEXT: MOV * T0.W, KC0[6].X,
+; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, PV.W,
+; R600-NEXT: MOV * T1.W, KC0[5].W,
+; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, PV.W,
+; R600-NEXT: MOV * T1.W, KC0[5].Z,
+; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, PV.W,
+; R600-NEXT: MOV * T1.W, KC0[5].Y,
+; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, PV.W,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
@@ -543,14 +460,8 @@ define <2 x i32> @v_fshr_v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2
; GFX89-LABEL: v_fshr_v2i32:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT: v_and_b32_e32 v4, 31, v4
; GFX89-NEXT: v_alignbit_b32 v0, v0, v2, v4
-; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX89-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX89-NEXT: v_and_b32_e32 v2, 31, v5
-; GFX89-NEXT: v_alignbit_b32 v1, v1, v3, v2
-; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX89-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX89-NEXT: v_alignbit_b32 v1, v1, v3, v5
; GFX89-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v2i32:
@@ -565,18 +476,9 @@ define <3 x i32> @v_fshr_v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2
; GFX89-LABEL: v_fshr_v3i32:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT: v_and_b32_e32 v6, 31, v6
; GFX89-NEXT: v_alignbit_b32 v0, v0, v3, v6
-; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6
-; GFX89-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX89-NEXT: v_and_b32_e32 v3, 31, v7
-; GFX89-NEXT: v_alignbit_b32 v1, v1, v4, v3
-; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX89-NEXT: v_and_b32_e32 v3, 31, v8
-; GFX89-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX89-NEXT: v_alignbit_b32 v2, v2, v5, v3
-; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX89-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX89-NEXT: v_alignbit_b32 v1, v1, v4, v7
+; GFX89-NEXT: v_alignbit_b32 v2, v2, v5, v8
; GFX89-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v3i32:
@@ -591,22 +493,10 @@ define <4 x i32> @v_fshr_v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2
; GFX89-LABEL: v_fshr_v4i32:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT: v_and_b32_e32 v8, 31, v8
; GFX89-NEXT: v_alignbit_b32 v0, v0, v4, v8
-; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
-; GFX89-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX89-NEXT: v_and_b32_e32 v4, 31, v9
-; GFX89-NEXT: v_alignbit_b32 v1, v1, v5, v4
-; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX89-NEXT: v_and_b32_e32 v4, 31, v10
-; GFX89-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX89-NEXT: v_alignbit_b32 v2, v2, v6, v4
-; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX89-NEXT: v_and_b32_e32 v4, 31, v11
-; GFX89-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX89-NEXT: v_alignbit_b32 v3, v3, v7, v4
-; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX89-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX89-NEXT: v_alignbit_b32 v1, v1, v5, v9
+; GFX89-NEXT: v_alignbit_b32 v2, v2, v6, v10
+; GFX89-NEXT: v_alignbit_b32 v3, v3, v7, v11
; GFX89-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v4i32:
@@ -621,38 +511,33 @@ define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
; SI-LABEL: v_fshr_i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 15, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v1
-; SI-NEXT: v_sub_i32_e32 v4, vcc, 16, v2
-; SI-NEXT: v_lshr_b32_e32 v3, v3, v2
-; SI-NEXT: v_lshl_b32_e32 v0, v0, v4
-; SI-NEXT: v_or_b32_e32 v0, v0, v3
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-NEXT: v_or_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_alignbit_b32 v0, v0, v1, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_xor_b32_e32 v3, -1, v2
; VI-NEXT: v_and_b32_e32 v2, 15, v2
-; VI-NEXT: v_sub_u16_e32 v4, 16, v2
-; VI-NEXT: v_lshrrev_b16_e32 v3, v2, v1
-; VI-NEXT: v_lshlrev_b16_e32 v0, v4, v0
-; VI-NEXT: v_or_b32_e32 v0, v0, v3
-; VI-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
-; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
+; VI-NEXT: v_and_b32_e32 v3, 15, v3
+; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0
+; VI-NEXT: v_lshrrev_b16_e32 v1, v2, v1
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2
; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-NEXT: v_sub_u16_e32 v4, 16, v2
-; GFX9-NEXT: v_lshrrev_b16_e32 v3, v2, v1
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, v4, v0
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
+; GFX9-NEXT: v_and_b32_e32 v3, 15, v3
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
+; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_i16:
@@ -667,23 +552,12 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2
; SI-LABEL: v_fshr_v2i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, 0xffff
-; SI-NEXT: v_and_b32_e32 v5, 15, v5
-; SI-NEXT: v_and_b32_e32 v7, s4, v3
-; SI-NEXT: v_sub_i32_e32 v8, vcc, 16, v5
-; SI-NEXT: v_lshr_b32_e32 v7, v7, v5
-; SI-NEXT: v_lshl_b32_e32 v1, v1, v8
-; SI-NEXT: v_or_b32_e32 v1, v1, v7
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
-; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; SI-NEXT: v_and_b32_e32 v3, 15, v4
-; SI-NEXT: v_sub_i32_e32 v5, vcc, 16, v3
-; SI-NEXT: v_and_b32_e32 v6, s4, v2
-; SI-NEXT: v_lshr_b32_e32 v4, v6, v3
-; SI-NEXT: v_lshl_b32_e32 v0, v0, v5
-; SI-NEXT: v_or_b32_e32 v0, v0, v4
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_or_b32_e32 v5, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_alignbit_b32 v1, v1, v3, v5
+; SI-NEXT: v_or_b32_e32 v3, 16, v4
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_alignbit_b32 v0, v0, v2, v3
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_or_b32_e32 v0, v0, v1
@@ -693,44 +567,36 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2
; VI-LABEL: v_fshr_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v3, 0xf000f, v2
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; VI-NEXT: v_bfe_u32 v2, v2, 16, 4
-; VI-NEXT: v_lshrrev_b16_e32 v4, v3, v1
-; VI-NEXT: v_lshrrev_b16_sdwa v6, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v4, v4, v6
-; VI-NEXT: v_sub_u16_e32 v6, 16, v2
-; VI-NEXT: v_sub_u16_e32 v7, 16, v3
-; VI-NEXT: v_lshlrev_b16_sdwa v6, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT: v_lshlrev_b16_e32 v0, v7, v0
-; VI-NEXT: v_or_b32_e32 v0, v0, v6
-; VI-NEXT: v_or_b32_e32 v0, v0, v4
-; VI-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
-; VI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; VI-NEXT: v_and_b32_e32 v4, 15, v3
+; VI-NEXT: v_mov_b32_e32 v5, 1
+; VI-NEXT: v_xor_b32_e32 v3, -1, v3
+; VI-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_and_b32_e32 v3, 15, v3
+; VI-NEXT: v_lshrrev_b16_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_lshlrev_b16_e32 v3, v3, v5
+; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_xor_b32_e32 v4, -1, v2
+; VI-NEXT: v_and_b32_e32 v2, 15, v2
+; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
+; VI-NEXT: v_and_b32_e32 v4, 15, v4
+; VI-NEXT: v_lshlrev_b16_e32 v0, v4, v0
+; VI-NEXT: v_lshrrev_b16_e32 v1, v2, v1
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v2, 0xf000f, v2
-; GFX9-NEXT: v_pk_sub_i16 v4, 16, v2 op_sel_hi:[0,1]
-; GFX9-NEXT: v_pk_lshlrev_b16 v0, v4, v0
-; GFX9-NEXT: v_pk_lshrrev_b16 v3, v2, v1
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_cmp_eq_u16_sdwa s[4:5], v2, v4 src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2
+; GFX9-NEXT: s_mov_b32 s4, 0xf000f
+; GFX9-NEXT: v_and_b32_e32 v2, s4, v2
+; GFX9-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
+; GFX9-NEXT: v_and_b32_e32 v3, s4, v3
+; GFX9-NEXT: v_pk_lshlrev_b16 v0, v3, v0
+; GFX9-NEXT: v_pk_lshrrev_b16 v1, v2, v1
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v2i16:
@@ -745,105 +611,80 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2
; SI-LABEL: v_fshr_v3i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_or_b32_e32 v7, 16, v7
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_alignbit_b32 v1, v1, v4, v7
+; SI-NEXT: v_or_b32_e32 v4, 16, v6
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_alignbit_b32 v0, v0, v3, v4
; SI-NEXT: s_mov_b32 s4, 0xffff
-; SI-NEXT: v_and_b32_e32 v7, 15, v7
-; SI-NEXT: v_and_b32_e32 v12, s4, v4
-; SI-NEXT: v_sub_i32_e32 v13, vcc, 16, v7
-; SI-NEXT: v_lshr_b32_e32 v12, v12, v7
-; SI-NEXT: v_lshl_b32_e32 v1, v1, v13
-; SI-NEXT: v_or_b32_e32 v1, v1, v12
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
-; SI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
-; SI-NEXT: v_and_b32_e32 v4, 15, v6
-; SI-NEXT: v_sub_i32_e32 v7, vcc, 16, v4
-; SI-NEXT: v_and_b32_e32 v11, s4, v3
-; SI-NEXT: v_lshr_b32_e32 v6, v11, v4
-; SI-NEXT: v_lshl_b32_e32 v0, v0, v7
-; SI-NEXT: v_or_b32_e32 v0, v0, v6
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; SI-NEXT: v_and_b32_e32 v3, 15, v8
-; SI-NEXT: v_sub_i32_e32 v6, vcc, 16, v3
-; SI-NEXT: v_and_b32_e32 v10, s4, v5
-; SI-NEXT: v_lshr_b32_e32 v4, v10, v3
-; SI-NEXT: v_lshl_b32_e32 v2, v2, v6
-; SI-NEXT: v_mov_b32_e32 v9, 0xffff
-; SI-NEXT: v_or_b32_e32 v2, v2, v4
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; SI-NEXT: v_cndmask_b32_e32 v3, v2, v5, vcc
+; SI-NEXT: v_or_b32_e32 v3, 16, v8
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; SI-NEXT: v_alignbit_b32 v3, v2, v4, v3
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v0, v9, v0
+; SI-NEXT: v_and_b32_e32 v0, s4, v0
; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_and_b32_e32 v2, v9, v3
+; SI-NEXT: v_and_b32_e32 v2, s4, v3
; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_v3i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v6, 15
-; VI-NEXT: v_and_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v2
-; VI-NEXT: v_lshrrev_b16_e32 v8, v6, v7
-; VI-NEXT: v_sub_u16_e32 v6, 16, v6
-; VI-NEXT: v_lshlrev_b16_sdwa v6, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT: v_or_b32_e32 v6, v6, v8
-; VI-NEXT: v_bfe_u32 v8, v4, 16, 4
-; VI-NEXT: v_cmp_eq_u16_e32 vcc, 0, v8
-; VI-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc
-; VI-NEXT: v_and_b32_e32 v7, 15, v5
-; VI-NEXT: v_lshrrev_b16_e32 v8, v7, v3
-; VI-NEXT: v_sub_u16_e32 v7, 16, v7
-; VI-NEXT: v_lshlrev_b16_e32 v1, v7, v1
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; VI-NEXT: v_and_b32_e32 v7, 15, v6
+; VI-NEXT: v_mov_b32_e32 v8, 1
+; VI-NEXT: v_xor_b32_e32 v6, -1, v6
+; VI-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_and_b32_e32 v6, 15, v6
+; VI-NEXT: v_lshrrev_b16_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_lshlrev_b16_e32 v6, v6, v8
+; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_xor_b32_e32 v7, -1, v5
; VI-NEXT: v_and_b32_e32 v5, 15, v5
-; VI-NEXT: v_or_b32_e32 v1, v1, v8
-; VI-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
-; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; VI-NEXT: v_and_b32_e32 v3, 15, v4
-; VI-NEXT: v_lshrrev_b16_e32 v5, v3, v2
-; VI-NEXT: v_sub_u16_e32 v3, 16, v3
+; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1
+; VI-NEXT: v_and_b32_e32 v7, 15, v7
+; VI-NEXT: v_lshlrev_b16_e32 v1, v7, v1
+; VI-NEXT: v_lshrrev_b16_e32 v3, v5, v3
+; VI-NEXT: v_or_b32_e32 v1, v1, v3
+; VI-NEXT: v_xor_b32_e32 v3, -1, v4
+; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
+; VI-NEXT: v_and_b32_e32 v3, 15, v3
; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0
-; VI-NEXT: v_and_b32_e32 v3, 0xf000f, v4
-; VI-NEXT: v_or_b32_e32 v0, v0, v5
-; VI-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
-; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v3, 15, v4
+; VI-NEXT: v_lshrrev_b16_e32 v2, v3, v2
+; VI-NEXT: v_or_b32_e32 v0, v0, v2
+; VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_v3i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v7, 15
-; GFX9-NEXT: v_and_b32_e32 v6, 15, v4
-; GFX9-NEXT: v_mov_b32_e32 v8, 0xffff
-; GFX9-NEXT: v_and_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_and_b32_e32 v6, v8, v6
-; GFX9-NEXT: v_lshl_or_b32 v6, v7, 16, v6
-; GFX9-NEXT: v_pk_lshrrev_b16 v7, v6, v2
-; GFX9-NEXT: v_pk_sub_i16 v6, 16, v6 op_sel_hi:[0,1]
-; GFX9-NEXT: s_mov_b32 s6, 0xf000f
-; GFX9-NEXT: v_pk_lshlrev_b16 v0, v6, v0
-; GFX9-NEXT: v_and_b32_e32 v4, s6, v4
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v7
-; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v4
-; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_cmp_eq_u16_sdwa s[4:5], v4, v7 src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5]
-; GFX9-NEXT: v_and_b32_e32 v2, 15, v5
-; GFX9-NEXT: v_and_b32_e32 v2, v8, v2
-; GFX9-NEXT: v_pk_lshrrev_b16 v4, v2, v3
-; GFX9-NEXT: v_pk_sub_i16 v2, 16, v2
-; GFX9-NEXT: v_pk_lshlrev_b16 v1, v2, v1
-; GFX9-NEXT: v_and_b32_e32 v2, s6, v5
-; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX9-NEXT: v_or_b32_e32 v1, v1, v4
-; GFX9-NEXT: v_and_b32_e32 v2, v8, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; GFX9-NEXT: v_and_b32_e32 v7, 15, v6
+; GFX9-NEXT: v_mov_b32_e32 v8, 1
+; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6
+; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_and_b32_e32 v6, 15, v6
+; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v8
+; GFX9-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX9-NEXT: v_xor_b32_e32 v7, -1, v5
+; GFX9-NEXT: v_and_b32_e32 v5, 15, v5
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1
+; GFX9-NEXT: v_and_b32_e32 v7, 15, v7
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, v7, v1
+; GFX9-NEXT: v_lshrrev_b16_e32 v3, v5, v3
+; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
+; GFX9-NEXT: v_and_b32_e32 v3, 15, v3
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
+; GFX9-NEXT: v_and_b32_e32 v3, 15, v4
+; GFX9-NEXT: v_lshrrev_b16_e32 v2, v3, v2
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v3i16:
@@ -858,45 +699,24 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
; SI-LABEL: v_fshr_v4i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_or_b32_e32 v9, 16, v9
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_alignbit_b32 v1, v1, v5, v9
+; SI-NEXT: v_or_b32_e32 v5, 16, v8
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_alignbit_b32 v0, v0, v4, v5
+; SI-NEXT: v_or_b32_e32 v4, 16, v11
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; SI-NEXT: v_alignbit_b32 v3, v3, v5, v4
+; SI-NEXT: v_or_b32_e32 v4, 16, v10
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; SI-NEXT: s_mov_b32 s4, 0xffff
-; SI-NEXT: v_and_b32_e32 v9, 15, v9
-; SI-NEXT: v_and_b32_e32 v16, s4, v5
-; SI-NEXT: v_sub_i32_e32 v17, vcc, 16, v9
-; SI-NEXT: v_lshr_b32_e32 v16, v16, v9
-; SI-NEXT: v_lshl_b32_e32 v1, v1, v17
-; SI-NEXT: v_or_b32_e32 v1, v1, v16
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
-; SI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
-; SI-NEXT: v_and_b32_e32 v5, 15, v8
-; SI-NEXT: v_sub_i32_e32 v9, vcc, 16, v5
-; SI-NEXT: v_and_b32_e32 v15, s4, v4
-; SI-NEXT: v_lshr_b32_e32 v8, v15, v5
-; SI-NEXT: v_lshl_b32_e32 v0, v0, v9
-; SI-NEXT: v_or_b32_e32 v0, v0, v8
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; SI-NEXT: v_and_b32_e32 v4, 15, v11
-; SI-NEXT: v_sub_i32_e32 v8, vcc, 16, v4
-; SI-NEXT: v_and_b32_e32 v14, s4, v7
-; SI-NEXT: v_lshr_b32_e32 v5, v14, v4
-; SI-NEXT: v_lshl_b32_e32 v3, v3, v8
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
-; SI-NEXT: v_or_b32_e32 v3, v3, v5
-; SI-NEXT: v_and_b32_e32 v4, 15, v10
-; SI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
-; SI-NEXT: v_sub_i32_e32 v7, vcc, 16, v4
-; SI-NEXT: v_and_b32_e32 v13, s4, v6
-; SI-NEXT: v_lshr_b32_e32 v5, v13, v4
-; SI-NEXT: v_lshl_b32_e32 v2, v2, v7
-; SI-NEXT: v_or_b32_e32 v2, v2, v5
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
-; SI-NEXT: v_mov_b32_e32 v12, 0xffff
-; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
+; SI-NEXT: v_alignbit_b32 v2, v2, v5, v4
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v2, v12, v2
+; SI-NEXT: v_and_b32_e32 v2, s4, v2
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v0, v12, v0
+; SI-NEXT: v_and_b32_e32 v0, s4, v0
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
@@ -905,89 +725,80 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
; VI-LABEL: v_fshr_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v6, 15
-; VI-NEXT: v_and_b32_sdwa v7, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3
-; VI-NEXT: v_lshrrev_b16_e32 v9, v7, v8
-; VI-NEXT: v_sub_u16_e32 v7, 16, v7
-; VI-NEXT: v_lshlrev_b16_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT: v_or_b32_e32 v7, v7, v9
-; VI-NEXT: v_bfe_u32 v9, v5, 16, 4
-; VI-NEXT: v_cmp_eq_u16_e32 vcc, 0, v9
-; VI-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
-; VI-NEXT: v_and_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v2
-; VI-NEXT: v_lshrrev_b16_e32 v9, v6, v8
-; VI-NEXT: v_sub_u16_e32 v6, 16, v6
-; VI-NEXT: v_lshlrev_b16_sdwa v6, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT: v_or_b32_e32 v6, v6, v9
-; VI-NEXT: v_bfe_u32 v9, v4, 16, 4
-; VI-NEXT: v_cmp_eq_u16_e32 vcc, 0, v9
-; VI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
-; VI-NEXT: v_and_b32_e32 v8, 15, v5
-; VI-NEXT: v_lshrrev_b16_e32 v9, v8, v3
-; VI-NEXT: v_sub_u16_e32 v8, 16, v8
-; VI-NEXT: s_mov_b32 s4, 0xf000f
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v5
+; VI-NEXT: v_and_b32_e32 v7, 15, v6
+; VI-NEXT: v_xor_b32_e32 v6, -1, v6
+; VI-NEXT: v_mov_b32_e32 v8, 1
+; VI-NEXT: v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_and_b32_e32 v6, 15, v6
+; VI-NEXT: v_lshrrev_b16_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_lshlrev_b16_e32 v6, v6, v9
+; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; VI-NEXT: v_and_b32_e32 v9, 15, v7
+; VI-NEXT: v_xor_b32_e32 v7, -1, v7
+; VI-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_and_b32_e32 v7, 15, v7
+; VI-NEXT: v_lshlrev_b16_e32 v7, v7, v8
+; VI-NEXT: v_xor_b32_e32 v8, -1, v5
+; VI-NEXT: v_and_b32_e32 v5, 15, v5
+; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1
+; VI-NEXT: v_and_b32_e32 v8, 15, v8
; VI-NEXT: v_lshlrev_b16_e32 v1, v8, v1
-; VI-NEXT: v_and_b32_e32 v5, s4, v5
-; VI-NEXT: v_or_b32_e32 v1, v1, v9
-; VI-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
-; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; VI-NEXT: v_and_b32_e32 v3, 15, v4
-; VI-NEXT: v_lshrrev_b16_e32 v5, v3, v2
-; VI-NEXT: v_sub_u16_e32 v3, 16, v3
+; VI-NEXT: v_lshrrev_b16_e32 v3, v5, v3
+; VI-NEXT: v_or_b32_e32 v1, v1, v3
+; VI-NEXT: v_xor_b32_e32 v3, -1, v4
+; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
+; VI-NEXT: v_and_b32_e32 v3, 15, v3
; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0
-; VI-NEXT: v_and_b32_e32 v3, s4, v4
-; VI-NEXT: v_or_b32_e32 v0, v0, v5
-; VI-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
-; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v3, 15, v4
+; VI-NEXT: v_lshrrev_b16_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_lshrrev_b16_e32 v2, v3, v2
+; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v0, v2
+; VI-NEXT: v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v7, 15
-; GFX9-NEXT: v_and_b32_e32 v6, 15, v5
-; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff
-; GFX9-NEXT: v_and_b32_sdwa v8, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_and_b32_e32 v6, v9, v6
-; GFX9-NEXT: v_lshl_or_b32 v6, v8, 16, v6
-; GFX9-NEXT: v_pk_lshrrev_b16 v8, v6, v3
-; GFX9-NEXT: v_pk_sub_i16 v6, 16, v6 op_sel_hi:[0,1]
-; GFX9-NEXT: s_mov_b32 s6, 0xf000f
-; GFX9-NEXT: v_pk_lshlrev_b16 v1, v6, v1
-; GFX9-NEXT: v_and_b32_e32 v5, s6, v5
-; GFX9-NEXT: v_or_b32_e32 v1, v1, v8
-; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
-; GFX9-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v1, v3, vcc
-; GFX9-NEXT: v_cmp_eq_u16_sdwa s[4:5], v5, v8 src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[4:5]
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v5
+; GFX9-NEXT: v_and_b32_e32 v7, 15, v6
+; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6
+; GFX9-NEXT: v_mov_b32_e32 v8, 1
+; GFX9-NEXT: v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_and_b32_e32 v6, 15, v6
+; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v9
+; GFX9-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; GFX9-NEXT: v_and_b32_e32 v9, 15, v7
+; GFX9-NEXT: v_xor_b32_e32 v7, -1, v7
+; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_and_b32_e32 v7, 15, v7
+; GFX9-NEXT: v_lshlrev_b16_e32 v7, v7, v8
+; GFX9-NEXT: v_xor_b32_e32 v8, -1, v5
+; GFX9-NEXT: v_and_b32_e32 v5, 15, v5
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1
+; GFX9-NEXT: v_and_b32_e32 v8, 15, v8
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, v8, v1
+; GFX9-NEXT: v_lshrrev_b16_e32 v3, v5, v3
+; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
+; GFX9-NEXT: v_and_b32_e32 v3, 15, v3
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
; GFX9-NEXT: v_and_b32_e32 v3, 15, v4
-; GFX9-NEXT: v_and_b32_sdwa v5, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_and_b32_e32 v3, v9, v3
-; GFX9-NEXT: v_lshl_or_b32 v3, v5, 16, v3
-; GFX9-NEXT: v_pk_lshrrev_b16 v5, v3, v2
-; GFX9-NEXT: v_pk_sub_i16 v3, 16, v3 op_sel_hi:[0,1]
-; GFX9-NEXT: v_pk_lshlrev_b16 v0, v3, v0
-; GFX9-NEXT: v_and_b32_e32 v3, s6, v4
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v5
-; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_cmp_eq_u16_sdwa s[4:5], v3, v8 src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5]
-; GFX9-NEXT: v_and_b32_e32 v2, v9, v4
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2
-; GFX9-NEXT: v_and_b32_e32 v2, v9, v6
-; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-NEXT: v_lshrrev_b16_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshrrev_b16_e32 v2, v3, v2
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
+; GFX9-NEXT: v_and_b32_e32 v1, v2, v1
+; GFX9-NEXT: v_or_b32_e32 v7, v7, v9
+; GFX9-NEXT: v_and_b32_e32 v0, v2, v0
+; GFX9-NEXT: v_lshl_or_b32 v0, v7, 16, v0
+; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v4i16:
@@ -1002,46 +813,40 @@ define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
; SI-LABEL: v_fshr_i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_and_b32_e32 v5, 63, v4
+; SI-NEXT: v_not_b32_e32 v4, v4
+; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
; SI-NEXT: v_and_b32_e32 v4, 63, v4
-; SI-NEXT: v_sub_i32_e32 v7, vcc, 64, v4
-; SI-NEXT: v_lshr_b64 v[5:6], v[2:3], v4
-; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v7
-; SI-NEXT: v_or_b32_e32 v0, v0, v5
-; SI-NEXT: v_mov_b32_e32 v5, 0
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5]
-; SI-NEXT: v_or_b32_e32 v1, v1, v6
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v5
+; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v4
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_or_b32_e32 v0, v0, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_i64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_and_b32_e32 v5, 63, v4
+; VI-NEXT: v_not_b32_e32 v4, v4
+; VI-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; VI-NEXT: v_and_b32_e32 v4, 63, v4
-; VI-NEXT: v_sub_u32_e32 v7, vcc, 64, v4
-; VI-NEXT: v_lshrrev_b64 v[5:6], v4, v[2:3]
-; VI-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
-; VI-NEXT: v_or_b32_e32 v0, v0, v5
-; VI-NEXT: v_mov_b32_e32 v5, 0
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5]
-; VI-NEXT: v_or_b32_e32 v1, v1, v6
-; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; VI-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3]
+; VI-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
+; VI-NEXT: v_or_b32_e32 v1, v1, v3
+; VI-NEXT: v_or_b32_e32 v0, v0, v2
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v5, 63, v4
+; GFX9-NEXT: v_not_b32_e32 v4, v4
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; GFX9-NEXT: v_and_b32_e32 v4, 63, v4
-; GFX9-NEXT: v_sub_u32_e32 v7, 64, v4
-; GFX9-NEXT: v_lshrrev_b64 v[5:6], v4, v[2:3]
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v5
-; GFX9-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5]
-; GFX9-NEXT: v_or_b32_e32 v1, v1, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3]
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
+; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_i64:
@@ -1056,73 +861,64 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2
; SI-LABEL: v_fshr_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_and_b32_e32 v9, 63, v8
+; SI-NEXT: v_not_b32_e32 v8, v8
+; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
; SI-NEXT: v_and_b32_e32 v8, 63, v8
-; SI-NEXT: v_sub_i32_e32 v9, vcc, 64, v8
-; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v9
-; SI-NEXT: v_lshr_b64 v[11:12], v[4:5], v8
-; SI-NEXT: v_mov_b32_e32 v9, 0
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; SI-NEXT: v_or_b32_e32 v0, v0, v11
-; SI-NEXT: v_and_b32_e32 v8, 63, v10
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; SI-NEXT: v_sub_i32_e64 v4, s[4:5], 64, v8
-; SI-NEXT: v_or_b32_e32 v1, v1, v12
-; SI-NEXT: v_lshr_b64 v[10:11], v[6:7], v8
-; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v4
-; SI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; SI-NEXT: v_or_b32_e32 v3, v3, v11
-; SI-NEXT: v_or_b32_e32 v2, v2, v10
-; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
-; SI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], v9
+; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v8
+; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; SI-NEXT: v_or_b32_e32 v1, v1, v5
+; SI-NEXT: v_and_b32_e32 v5, 63, v10
+; SI-NEXT: v_lshr_b64 v[5:6], v[6:7], v5
+; SI-NEXT: v_not_b32_e32 v7, v10
+; SI-NEXT: v_and_b32_e32 v7, 63, v7
+; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v7
+; SI-NEXT: v_or_b32_e32 v0, v0, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v6
+; SI-NEXT: v_or_b32_e32 v2, v2, v5
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_and_b32_e32 v9, 63, v8
+; VI-NEXT: v_not_b32_e32 v8, v8
+; VI-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; VI-NEXT: v_and_b32_e32 v8, 63, v8
-; VI-NEXT: v_sub_u32_e32 v9, vcc, 64, v8
-; VI-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
-; VI-NEXT: v_lshrrev_b64 v[11:12], v8, v[4:5]
-; VI-NEXT: v_mov_b32_e32 v9, 0
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; VI-NEXT: v_or_b32_e32 v0, v0, v11
-; VI-NEXT: v_and_b32_e32 v8, 63, v10
-; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; VI-NEXT: v_sub_u32_e64 v4, s[4:5], 64, v8
-; VI-NEXT: v_or_b32_e32 v1, v1, v12
-; VI-NEXT: v_lshrrev_b64 v[10:11], v8, v[6:7]
-; VI-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
-; VI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; VI-NEXT: v_or_b32_e32 v3, v3, v11
-; VI-NEXT: v_or_b32_e32 v2, v2, v10
-; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
-; VI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; VI-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5]
+; VI-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
+; VI-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
+; VI-NEXT: v_or_b32_e32 v1, v1, v5
+; VI-NEXT: v_and_b32_e32 v5, 63, v10
+; VI-NEXT: v_lshrrev_b64 v[5:6], v5, v[6:7]
+; VI-NEXT: v_not_b32_e32 v7, v10
+; VI-NEXT: v_and_b32_e32 v7, 63, v7
+; VI-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
+; VI-NEXT: v_or_b32_e32 v0, v0, v4
+; VI-NEXT: v_or_b32_e32 v3, v3, v6
+; VI-NEXT: v_or_b32_e32 v2, v2, v5
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v9, 63, v8
+; GFX9-NEXT: v_not_b32_e32 v8, v8
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; GFX9-NEXT: v_and_b32_e32 v8, 63, v8
-; GFX9-NEXT: v_sub_u32_e32 v9, 64, v8
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
-; GFX9-NEXT: v_lshrrev_b64 v[11:12], v8, v[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v9, 0
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v11
-; GFX9-NEXT: v_and_b32_e32 v8, 63, v10
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT: v_sub_u32_e32 v4, 64, v8
-; GFX9-NEXT: v_or_b32_e32 v1, v1, v12
-; GFX9-NEXT: v_lshrrev_b64 v[10:11], v8, v[6:7]
-; GFX9-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; GFX9-NEXT: v_or_b32_e32 v3, v3, v11
-; GFX9-NEXT: v_or_b32_e32 v2, v2, v10
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX9-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5]
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX9-NEXT: v_or_b32_e32 v1, v1, v5
+; GFX9-NEXT: v_and_b32_e32 v5, 63, v10
+; GFX9-NEXT: v_lshrrev_b64 v[5:6], v5, v[6:7]
+; GFX9-NEXT: v_not_b32_e32 v7, v10
+; GFX9-NEXT: v_and_b32_e32 v7, 63, v7
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX9-NEXT: v_or_b32_e32 v3, v3, v6
+; GFX9-NEXT: v_or_b32_e32 v2, v2, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v2i64:
@@ -1137,60 +933,40 @@ define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
; SI-LABEL: v_fshr_i24:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, 0xffffff
-; SI-NEXT: v_and_b32_e32 v2, s4, v2
-; SI-NEXT: s_mov_b32 s5, 0xaaaaaaab
-; SI-NEXT: v_mul_hi_u32 v3, v2, s5
-; SI-NEXT: v_and_b32_e32 v4, s4, v1
+; SI-NEXT: s_mov_b32 s4, 0xaaaaaaab
+; SI-NEXT: v_mul_hi_u32 v3, v2, s4
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT: v_lshrrev_b32_e32 v3, 4, v3
; SI-NEXT: v_mul_lo_u32 v3, v3, 24
; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
-; SI-NEXT: v_lshr_b32_e32 v3, v4, v2
-; SI-NEXT: v_sub_i32_e32 v4, vcc, 24, v2
-; SI-NEXT: v_and_b32_e32 v4, s4, v4
-; SI-NEXT: v_lshl_b32_e32 v0, v0, v4
-; SI-NEXT: v_or_b32_e32 v0, v0, v3
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v2
+; SI-NEXT: v_alignbit_b32 v0, v0, v1, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_i24:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, 0xffffff
-; VI-NEXT: v_and_b32_e32 v2, s4, v2
-; VI-NEXT: s_mov_b32 s5, 0xaaaaaaab
-; VI-NEXT: v_mul_hi_u32 v3, v2, s5
-; VI-NEXT: v_and_b32_e32 v4, s4, v1
+; VI-NEXT: s_mov_b32 s4, 0xaaaaaaab
+; VI-NEXT: v_mul_hi_u32 v3, v2, s4
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; VI-NEXT: v_lshrrev_b32_e32 v3, 4, v3
; VI-NEXT: v_mul_lo_u32 v3, v3, 24
; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v3
-; VI-NEXT: v_lshrrev_b32_e32 v3, v2, v4
-; VI-NEXT: v_sub_u32_e32 v4, vcc, 24, v2
-; VI-NEXT: v_and_b32_e32 v4, s4, v4
-; VI-NEXT: v_lshlrev_b32_e32 v0, v4, v0
-; VI-NEXT: v_or_b32_e32 v0, v0, v3
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v2
+; VI-NEXT: v_alignbit_b32 v0, v0, v1, v2
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_i24:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, 0xffffff
-; GFX9-NEXT: v_and_b32_e32 v2, s4, v2
-; GFX9-NEXT: s_mov_b32 s5, 0xaaaaaaab
-; GFX9-NEXT: v_mul_hi_u32 v3, v2, s5
-; GFX9-NEXT: v_and_b32_e32 v4, s4, v1
+; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaaab
+; GFX9-NEXT: v_mul_hi_u32 v3, v2, s4
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 4, v3
; GFX9-NEXT: v_mul_lo_u32 v3, v3, 24
; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, v2, v4
-; GFX9-NEXT: v_sub_u32_e32 v4, 24, v2
-; GFX9-NEXT: v_and_b32_e32 v4, s4, v4
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, v4, v3
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: v_add_u32_e32 v2, 8, v2
+; GFX9-NEXT: v_alignbit_b32 v0, v0, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_i24:
@@ -1205,49 +981,35 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
; SI-LABEL: v_fshr_v2i24:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32
-; SI-NEXT: s_mov_b32 s4, 0xffffff
-; SI-NEXT: s_mov_b32 s5, 0xaaaaaaab
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; SI-NEXT: s_mov_b32 s4, 0xaaaaaaab
; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v0
; SI-NEXT: v_add_i32_e32 v8, vcc, 4, v0
; SI-NEXT: v_add_i32_e32 v9, vcc, 5, v0
; SI-NEXT: v_add_i32_e32 v10, vcc, 2, v0
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_and_b32_e32 v14, s4, v1
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_and_b32_e32 v2, s4, v2
-; SI-NEXT: v_mul_hi_u32 v12, v2, s5
+; SI-NEXT: v_mul_hi_u32 v11, v2, s4
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v3, s4, v3
-; SI-NEXT: v_mul_hi_u32 v13, v3, s5
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v11, s4, v5
+; SI-NEXT: v_mul_hi_u32 v12, v3, s4
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SI-NEXT: v_lshrrev_b32_e32 v11, 4, v11
; SI-NEXT: v_lshrrev_b32_e32 v12, 4, v12
+; SI-NEXT: v_mul_lo_u32 v11, v11, 24
; SI-NEXT: v_mul_lo_u32 v12, v12, 24
-; SI-NEXT: v_lshrrev_b32_e32 v13, 4, v13
-; SI-NEXT: v_mul_lo_u32 v13, v13, 24
-; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
-; SI-NEXT: v_lshr_b32_e32 v12, v14, v2
-; SI-NEXT: v_sub_i32_e32 v3, vcc, v3, v13
-; SI-NEXT: v_sub_i32_e32 v13, vcc, 24, v2
-; SI-NEXT: v_sub_i32_e32 v14, vcc, 24, v3
-; SI-NEXT: v_and_b32_e32 v13, s4, v13
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshl_b32_e32 v6, v6, v13
-; SI-NEXT: v_and_b32_e32 v14, 0xffffff, v14
-; SI-NEXT: v_lshr_b32_e32 v11, v11, v3
-; SI-NEXT: v_lshl_b32_e32 v4, v4, v14
-; SI-NEXT: v_or_b32_e32 v6, v6, v12
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; SI-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
-; SI-NEXT: v_or_b32_e32 v4, v4, v11
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; SI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v11
+; SI-NEXT: v_sub_i32_e32 v3, vcc, v3, v12
+; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v2
+; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v3
+; SI-NEXT: v_alignbit_b32 v1, v1, v6, v2
+; SI-NEXT: v_alignbit_b32 v2, v5, v4, v3
; SI-NEXT: buffer_store_byte v2, v7, s[0:3], 0 offen
; SI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
; SI-NEXT: v_lshrrev_b32_e32 v0, 8, v2
@@ -1264,49 +1026,35 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
; VI-LABEL: v_fshr_v2i24:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32
-; VI-NEXT: s_mov_b32 s4, 0xffffff
-; VI-NEXT: s_mov_b32 s5, 0xaaaaaaab
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; VI-NEXT: s_mov_b32 s4, 0xaaaaaaab
; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v0
; VI-NEXT: v_add_u32_e32 v8, vcc, 4, v0
; VI-NEXT: v_add_u32_e32 v9, vcc, 5, v0
; VI-NEXT: v_add_u32_e32 v10, vcc, 2, v0
-; VI-NEXT: s_waitcnt vmcnt(5)
-; VI-NEXT: v_and_b32_e32 v14, s4, v1
; VI-NEXT: s_waitcnt vmcnt(4)
-; VI-NEXT: v_and_b32_e32 v2, s4, v2
-; VI-NEXT: v_mul_hi_u32 v12, v2, s5
+; VI-NEXT: v_mul_hi_u32 v11, v2, s4
; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_and_b32_e32 v3, s4, v3
-; VI-NEXT: v_mul_hi_u32 v13, v3, s5
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_and_b32_e32 v11, s4, v5
+; VI-NEXT: v_mul_hi_u32 v12, v3, s4
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; VI-NEXT: v_lshrrev_b32_e32 v11, 4, v11
; VI-NEXT: v_lshrrev_b32_e32 v12, 4, v12
+; VI-NEXT: v_mul_lo_u32 v11, v11, 24
; VI-NEXT: v_mul_lo_u32 v12, v12, 24
-; VI-NEXT: v_lshrrev_b32_e32 v13, 4, v13
-; VI-NEXT: v_mul_lo_u32 v13, v13, 24
-; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v12
-; VI-NEXT: v_lshrrev_b32_e32 v12, v2, v14
-; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v13
-; VI-NEXT: v_sub_u32_e32 v13, vcc, 24, v2
-; VI-NEXT: v_sub_u32_e32 v14, vcc, 24, v3
-; VI-NEXT: v_and_b32_e32 v13, s4, v13
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v6, v13, v6
-; VI-NEXT: v_and_b32_e32 v14, 0xffffff, v14
-; VI-NEXT: v_lshrrev_b32_e32 v11, v3, v11
-; VI-NEXT: v_lshlrev_b32_e32 v4, v14, v4
-; VI-NEXT: v_or_b32_e32 v6, v6, v12
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; VI-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
-; VI-NEXT: v_or_b32_e32 v4, v4, v11
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v11
+; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v12
+; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 8, v3
+; VI-NEXT: v_alignbit_b32 v1, v1, v6, v2
+; VI-NEXT: v_alignbit_b32 v2, v5, v4, v3
; VI-NEXT: buffer_store_byte v2, v7, s[0:3], 0 offen
; VI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v2
@@ -1323,40 +1071,30 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32
-; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8
-; GFX9-NEXT: s_mov_b32 s4, 0xffffff
-; GFX9-NEXT: s_mov_b32 s5, 0xaaaaaaab
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32
+; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaaab
; GFX9-NEXT: s_waitcnt vmcnt(5)
-; GFX9-NEXT: v_and_b32_e32 v1, s4, v1
-; GFX9-NEXT: v_mul_hi_u32 v6, v1, s5
+; GFX9-NEXT: v_mul_hi_u32 v6, v1, s4
; GFX9-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NEXT: v_and_b32_e32 v2, s4, v2
-; GFX9-NEXT: v_mul_hi_u32 v7, v2, s5
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_and_b32_e32 v9, s4, v4
+; GFX9-NEXT: v_mul_hi_u32 v7, v2, s4
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v6
-; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 4, v7
; GFX9-NEXT: v_mul_lo_u32 v7, v7, 24
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v10, s4, v8
-; GFX9-NEXT: v_sub_u32_e32 v1, v1, v6
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, v1, v10
+; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
; GFX9-NEXT: v_sub_u32_e32 v2, v2, v7
-; GFX9-NEXT: v_sub_u32_e32 v7, 24, v1
-; GFX9-NEXT: v_sub_u32_e32 v10, 24, v2
-; GFX9-NEXT: v_and_b32_e32 v7, s4, v7
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, v2, v9
-; GFX9-NEXT: v_and_b32_e32 v10, 0xffffff, v10
-; GFX9-NEXT: v_lshl_or_b32 v5, v5, v7, v6
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc
-; GFX9-NEXT: v_lshl_or_b32 v3, v3, v10, v9
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT: v_sub_u32_e32 v1, v1, v6
+; GFX9-NEXT: v_add_u32_e32 v2, 8, v2
+; GFX9-NEXT: v_add_u32_e32 v1, 8, v1
+; GFX9-NEXT: v_alignbit_b32 v2, v4, v3, v2
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_alignbit_b32 v1, v8, v5, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v2
; GFX9-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:3
; GFX9-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen offset:2
diff --git a/llvm/test/CodeGen/PowerPC/funnel-shift-rot.ll b/llvm/test/CodeGen/PowerPC/funnel-shift-rot.ll
index 525dc8624798..d661dd56a065 100644
--- a/llvm/test/CodeGen/PowerPC/funnel-shift-rot.ll
+++ b/llvm/test/CodeGen/PowerPC/funnel-shift-rot.ll
@@ -126,7 +126,7 @@ define i16 @rotr_i16(i16 %x, i16 %z) {
; CHECK-NEXT: clrlwi 5, 5, 28
; CHECK-NEXT: srw 4, 6, 4
; CHECK-NEXT: slw 3, 3, 5
-; CHECK-NEXT: or 3, 3, 4
+; CHECK-NEXT: or 3, 4, 3
; CHECK-NEXT: blr
%f = call i16 @llvm.fshr.i16(i16 %x, i16 %x, i16 %z)
ret i16 %f
diff --git a/llvm/test/CodeGen/PowerPC/funnel-shift.ll b/llvm/test/CodeGen/PowerPC/funnel-shift.ll
index 364ab29de385..66be4606ada2 100644
--- a/llvm/test/CodeGen/PowerPC/funnel-shift.ll
+++ b/llvm/test/CodeGen/PowerPC/funnel-shift.ll
@@ -47,21 +47,20 @@ define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) {
; CHECK-LABEL: fshl_i37:
; CHECK: # %bb.0:
; CHECK-NEXT: lis 6, -8857
-; CHECK-NEXT: clrldi 5, 5, 27
+; CHECK-NEXT: sldi 4, 4, 27
; CHECK-NEXT: ori 6, 6, 51366
-; CHECK-NEXT: clrldi 4, 4, 27
; CHECK-NEXT: sldi 6, 6, 32
; CHECK-NEXT: oris 6, 6, 3542
; CHECK-NEXT: ori 6, 6, 31883
; CHECK-NEXT: mulhdu 6, 5, 6
; CHECK-NEXT: rldicl 6, 6, 59, 5
; CHECK-NEXT: mulli 6, 6, 37
-; CHECK-NEXT: sub. 5, 5, 6
-; CHECK-NEXT: subfic 6, 5, 37
-; CHECK-NEXT: sld 5, 3, 5
+; CHECK-NEXT: sub 5, 5, 6
+; CHECK-NEXT: clrlwi 5, 5, 26
+; CHECK-NEXT: subfic 6, 5, 64
+; CHECK-NEXT: sld 3, 3, 5
; CHECK-NEXT: srd 4, 4, 6
-; CHECK-NEXT: or 4, 5, 4
-; CHECK-NEXT: iseleq 3, 3, 4
+; CHECK-NEXT: or 3, 3, 4
; CHECK-NEXT: blr
%f = call i37 @llvm.fshl.i37(i37 %x, i37 %y, i37 %z)
ret i37 %f
@@ -165,7 +164,7 @@ define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) {
; CHECK-LABEL: fshr_i37:
; CHECK: # %bb.0:
; CHECK-NEXT: lis 6, -8857
-; CHECK-NEXT: clrldi 5, 5, 27
+; CHECK-NEXT: sldi 4, 4, 27
; CHECK-NEXT: ori 6, 6, 51366
; CHECK-NEXT: sldi 6, 6, 32
; CHECK-NEXT: oris 6, 6, 3542
@@ -173,13 +172,13 @@ define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) {
; CHECK-NEXT: mulhdu 6, 5, 6
; CHECK-NEXT: rldicl 6, 6, 59, 5
; CHECK-NEXT: mulli 6, 6, 37
-; CHECK-NEXT: sub. 5, 5, 6
-; CHECK-NEXT: clrldi 6, 4, 27
-; CHECK-NEXT: subfic 7, 5, 37
-; CHECK-NEXT: srd 5, 6, 5
-; CHECK-NEXT: sld 3, 3, 7
-; CHECK-NEXT: or 3, 3, 5
-; CHECK-NEXT: iseleq 3, 4, 3
+; CHECK-NEXT: sub 5, 5, 6
+; CHECK-NEXT: addi 5, 5, 27
+; CHECK-NEXT: clrlwi 5, 5, 26
+; CHECK-NEXT: subfic 6, 5, 64
+; CHECK-NEXT: srd 4, 4, 5
+; CHECK-NEXT: sld 3, 3, 6
+; CHECK-NEXT: or 3, 3, 4
; CHECK-NEXT: blr
%f = call i37 @llvm.fshr.i37(i37 %x, i37 %y, i37 %z)
ret i37 %f
diff --git a/llvm/test/CodeGen/RISCV/rv32Zbbp.ll b/llvm/test/CodeGen/RISCV/rv32Zbbp.ll
index 0e6288928f0c..7d1a7d0f7a32 100644
--- a/llvm/test/CodeGen/RISCV/rv32Zbbp.ll
+++ b/llvm/test/CodeGen/RISCV/rv32Zbbp.ll
@@ -411,7 +411,7 @@ define i32 @ror_i32(i32 %a, i32 %b) nounwind {
; RV32I-NEXT: srl a2, a0, a1
; RV32I-NEXT: neg a1, a1
; RV32I-NEXT: sll a0, a0, a1
-; RV32I-NEXT: or a0, a0, a2
+; RV32I-NEXT: or a0, a2, a0
; RV32I-NEXT: ret
;
; RV32IB-LABEL: ror_i32:
@@ -469,21 +469,21 @@ define i64 @ror_i64(i64 %a, i64 %b) nounwind {
; RV32I-NEXT: srli a0, a0, 1
; RV32I-NEXT: srl a0, a0, a4
; RV32I-NEXT: or a4, a3, a0
-; RV32I-NEXT: or a0, t0, a7
+; RV32I-NEXT: or a0, a7, t0
; RV32I-NEXT: bgez t1, .LBB9_9
; RV32I-NEXT: .LBB9_6:
; RV32I-NEXT: srl a1, a1, a2
-; RV32I-NEXT: or a1, a4, a1
+; RV32I-NEXT: or a1, a1, a4
; RV32I-NEXT: ret
; RV32I-NEXT: .LBB9_7:
; RV32I-NEXT: sll t0, a0, a4
; RV32I-NEXT: bltz a3, .LBB9_5
; RV32I-NEXT: .LBB9_8:
; RV32I-NEXT: sll a4, a0, a3
-; RV32I-NEXT: or a0, t0, a7
+; RV32I-NEXT: or a0, a7, t0
; RV32I-NEXT: bltz t1, .LBB9_6
; RV32I-NEXT: .LBB9_9:
-; RV32I-NEXT: or a1, a4, zero
+; RV32I-NEXT: or a1, zero, a4
; RV32I-NEXT: ret
;
; RV32IB-LABEL: ror_i64:
@@ -515,21 +515,21 @@ define i64 @ror_i64(i64 %a, i64 %b) nounwind {
; RV32IB-NEXT: srli a0, a0, 1
; RV32IB-NEXT: srl a0, a0, a4
; RV32IB-NEXT: or a4, a3, a0
-; RV32IB-NEXT: or a0, t0, a7
+; RV32IB-NEXT: or a0, a7, t0
; RV32IB-NEXT: bgez t1, .LBB9_9
; RV32IB-NEXT: .LBB9_6:
; RV32IB-NEXT: srl a1, a1, a2
-; RV32IB-NEXT: or a1, a4, a1
+; RV32IB-NEXT: or a1, a1, a4
; RV32IB-NEXT: ret
; RV32IB-NEXT: .LBB9_7:
; RV32IB-NEXT: sll t0, a0, a4
; RV32IB-NEXT: bltz a3, .LBB9_5
; RV32IB-NEXT: .LBB9_8:
; RV32IB-NEXT: sll a4, a0, a3
-; RV32IB-NEXT: or a0, t0, a7
+; RV32IB-NEXT: or a0, a7, t0
; RV32IB-NEXT: bltz t1, .LBB9_6
; RV32IB-NEXT: .LBB9_9:
-; RV32IB-NEXT: or a1, a4, zero
+; RV32IB-NEXT: or a1, zero, a4
; RV32IB-NEXT: ret
;
; RV32IBB-LABEL: ror_i64:
@@ -561,21 +561,21 @@ define i64 @ror_i64(i64 %a, i64 %b) nounwind {
; RV32IBB-NEXT: srli a0, a0, 1
; RV32IBB-NEXT: srl a0, a0, a4
; RV32IBB-NEXT: or a4, a3, a0
-; RV32IBB-NEXT: or a0, t0, a7
+; RV32IBB-NEXT: or a0, a7, t0
; RV32IBB-NEXT: bgez t1, .LBB9_9
; RV32IBB-NEXT: .LBB9_6:
; RV32IBB-NEXT: srl a1, a1, a2
-; RV32IBB-NEXT: or a1, a4, a1
+; RV32IBB-NEXT: or a1, a1, a4
; RV32IBB-NEXT: ret
; RV32IBB-NEXT: .LBB9_7:
; RV32IBB-NEXT: sll t0, a0, a4
; RV32IBB-NEXT: bltz a3, .LBB9_5
; RV32IBB-NEXT: .LBB9_8:
; RV32IBB-NEXT: sll a4, a0, a3
-; RV32IBB-NEXT: or a0, t0, a7
+; RV32IBB-NEXT: or a0, a7, t0
; RV32IBB-NEXT: bltz t1, .LBB9_6
; RV32IBB-NEXT: .LBB9_9:
-; RV32IBB-NEXT: or a1, a4, zero
+; RV32IBB-NEXT: or a1, zero, a4
; RV32IBB-NEXT: ret
;
; RV32IBP-LABEL: ror_i64:
@@ -607,21 +607,21 @@ define i64 @ror_i64(i64 %a, i64 %b) nounwind {
; RV32IBP-NEXT: srli a0, a0, 1
; RV32IBP-NEXT: srl a0, a0, a4
; RV32IBP-NEXT: or a4, a3, a0
-; RV32IBP-NEXT: or a0, t0, a7
+; RV32IBP-NEXT: or a0, a7, t0
; RV32IBP-NEXT: bgez t1, .LBB9_9
; RV32IBP-NEXT: .LBB9_6:
; RV32IBP-NEXT: srl a1, a1, a2
-; RV32IBP-NEXT: or a1, a4, a1
+; RV32IBP-NEXT: or a1, a1, a4
; RV32IBP-NEXT: ret
; RV32IBP-NEXT: .LBB9_7:
; RV32IBP-NEXT: sll t0, a0, a4
; RV32IBP-NEXT: bltz a3, .LBB9_5
; RV32IBP-NEXT: .LBB9_8:
; RV32IBP-NEXT: sll a4, a0, a3
-; RV32IBP-NEXT: or a0, t0, a7
+; RV32IBP-NEXT: or a0, a7, t0
; RV32IBP-NEXT: bltz t1, .LBB9_6
; RV32IBP-NEXT: .LBB9_9:
-; RV32IBP-NEXT: or a1, a4, zero
+; RV32IBP-NEXT: or a1, zero, a4
; RV32IBP-NEXT: ret
%or = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %b)
ret i64 %or
diff --git a/llvm/test/CodeGen/RISCV/rv32Zbt.ll b/llvm/test/CodeGen/RISCV/rv32Zbt.ll
index 54b5b79778f4..bb14a53c6e22 100644
--- a/llvm/test/CodeGen/RISCV/rv32Zbt.ll
+++ b/llvm/test/CodeGen/RISCV/rv32Zbt.ll
@@ -122,15 +122,11 @@ declare i32 @llvm.fshl.i32(i32, i32, i32)
define i32 @fshl_i32(i32 %a, i32 %b, i32 %c) nounwind {
; RV32I-LABEL: fshl_i32:
; RV32I: # %bb.0:
-; RV32I-NEXT: andi a3, a2, 31
-; RV32I-NEXT: beqz a3, .LBB4_2
-; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: sll a0, a0, a2
-; RV32I-NEXT: addi a2, zero, 32
-; RV32I-NEXT: sub a2, a2, a3
+; RV32I-NEXT: not a2, a2
+; RV32I-NEXT: srli a1, a1, 1
; RV32I-NEXT: srl a1, a1, a2
; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: .LBB4_2:
; RV32I-NEXT: ret
;
; RV32IB-LABEL: fshl_i32:
@@ -157,158 +153,149 @@ declare i64 @llvm.fshl.i64(i64, i64, i64)
define i64 @fshl_i64(i64 %a, i64 %b, i64 %c) nounwind {
; RV32I-LABEL: fshl_i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: andi t1, a4, 63
-; RV32I-NEXT: addi a6, t1, -32
-; RV32I-NEXT: addi a7, zero, 31
-; RV32I-NEXT: bltz a6, .LBB5_2
+; RV32I-NEXT: andi a5, a4, 63
+; RV32I-NEXT: addi t1, a5, -32
+; RV32I-NEXT: addi a6, zero, 31
+; RV32I-NEXT: bltz t1, .LBB5_2
; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: sll t0, a0, a6
+; RV32I-NEXT: sll a7, a0, t1
; RV32I-NEXT: j .LBB5_3
; RV32I-NEXT: .LBB5_2:
-; RV32I-NEXT: sll t0, a1, a4
-; RV32I-NEXT: sub t2, a7, t1
-; RV32I-NEXT: srli a5, a0, 1
-; RV32I-NEXT: srl a5, a5, t2
-; RV32I-NEXT: or t0, t0, a5
+; RV32I-NEXT: sll a7, a1, a4
+; RV32I-NEXT: sub a5, a6, a5
+; RV32I-NEXT: srli a1, a0, 1
+; RV32I-NEXT: srl a1, a1, a5
+; RV32I-NEXT: or a7, a7, a1
; RV32I-NEXT: .LBB5_3:
-; RV32I-NEXT: addi a5, zero, 32
-; RV32I-NEXT: sub t4, a5, t1
-; RV32I-NEXT: addi a5, zero, 64
-; RV32I-NEXT: sub t2, a5, t1
-; RV32I-NEXT: bltz t4, .LBB5_5
+; RV32I-NEXT: not a1, a4
+; RV32I-NEXT: andi t3, a1, 63
+; RV32I-NEXT: addi a5, t3, -32
+; RV32I-NEXT: srli t2, a3, 1
+; RV32I-NEXT: bltz a5, .LBB5_7
; RV32I-NEXT: # %bb.4:
-; RV32I-NEXT: mv t3, zero
-; RV32I-NEXT: bnez t1, .LBB5_6
-; RV32I-NEXT: j .LBB5_7
+; RV32I-NEXT: mv t0, zero
+; RV32I-NEXT: bgez a5, .LBB5_8
; RV32I-NEXT: .LBB5_5:
-; RV32I-NEXT: srl t3, a3, t2
-; RV32I-NEXT: beqz t1, .LBB5_7
+; RV32I-NEXT: slli a3, a3, 31
+; RV32I-NEXT: srli a2, a2, 1
+; RV32I-NEXT: or a2, a2, a3
+; RV32I-NEXT: srl a1, a2, a1
+; RV32I-NEXT: sub a2, a6, t3
+; RV32I-NEXT: slli a3, t2, 1
+; RV32I-NEXT: sll a2, a3, a2
+; RV32I-NEXT: or a2, a1, a2
+; RV32I-NEXT: or a1, a7, t0
+; RV32I-NEXT: bgez t1, .LBB5_9
; RV32I-NEXT: .LBB5_6:
-; RV32I-NEXT: or a1, t0, t3
+; RV32I-NEXT: sll a0, a0, a4
+; RV32I-NEXT: or a0, a0, a2
+; RV32I-NEXT: ret
; RV32I-NEXT: .LBB5_7:
-; RV32I-NEXT: bltz t4, .LBB5_10
-; RV32I-NEXT: # %bb.8:
-; RV32I-NEXT: srl a2, a3, t4
-; RV32I-NEXT: bgez a6, .LBB5_11
+; RV32I-NEXT: srl t0, t2, a1
+; RV32I-NEXT: bltz a5, .LBB5_5
+; RV32I-NEXT: .LBB5_8:
+; RV32I-NEXT: srl a2, t2, a5
+; RV32I-NEXT: or a1, a7, t0
+; RV32I-NEXT: bltz t1, .LBB5_6
; RV32I-NEXT: .LBB5_9:
-; RV32I-NEXT: sll a3, a0, a4
-; RV32I-NEXT: bnez t1, .LBB5_12
-; RV32I-NEXT: j .LBB5_13
-; RV32I-NEXT: .LBB5_10:
-; RV32I-NEXT: srl a2, a2, t2
-; RV32I-NEXT: sub a5, a7, t2
-; RV32I-NEXT: slli a3, a3, 1
-; RV32I-NEXT: sll a3, a3, a5
-; RV32I-NEXT: or a2, a2, a3
-; RV32I-NEXT: bltz a6, .LBB5_9
-; RV32I-NEXT: .LBB5_11:
-; RV32I-NEXT: mv a3, zero
-; RV32I-NEXT: beqz t1, .LBB5_13
-; RV32I-NEXT: .LBB5_12:
-; RV32I-NEXT: or a0, a3, a2
-; RV32I-NEXT: .LBB5_13:
+; RV32I-NEXT: or a0, zero, a2
; RV32I-NEXT: ret
;
; RV32IB-LABEL: fshl_i64:
; RV32IB: # %bb.0:
-; RV32IB-NEXT: andi t1, a4, 63
-; RV32IB-NEXT: addi a6, t1, -32
-; RV32IB-NEXT: addi a7, zero, 31
-; RV32IB-NEXT: bltz a6, .LBB5_2
+; RV32IB-NEXT: andi a5, a4, 63
+; RV32IB-NEXT: addi t2, a5, -32
+; RV32IB-NEXT: addi a6, zero, 31
+; RV32IB-NEXT: bltz t2, .LBB5_2
; RV32IB-NEXT: # %bb.1:
-; RV32IB-NEXT: sll t0, a0, a6
+; RV32IB-NEXT: sll a7, a0, t2
; RV32IB-NEXT: j .LBB5_3
; RV32IB-NEXT: .LBB5_2:
-; RV32IB-NEXT: sll t0, a1, a4
-; RV32IB-NEXT: sub t2, a7, t1
-; RV32IB-NEXT: srli a5, a0, 1
-; RV32IB-NEXT: srl a5, a5, t2
-; RV32IB-NEXT: or t0, t0, a5
+; RV32IB-NEXT: sll a7, a1, a4
+; RV32IB-NEXT: sub a5, a6, a5
+; RV32IB-NEXT: srli a1, a0, 1
+; RV32IB-NEXT: srl a1, a1, a5
+; RV32IB-NEXT: or a7, a7, a1
; RV32IB-NEXT: .LBB5_3:
-; RV32IB-NEXT: addi a5, zero, 32
-; RV32IB-NEXT: sub t4, a5, t1
-; RV32IB-NEXT: addi a5, zero, 64
-; RV32IB-NEXT: sub t2, a5, t1
-; RV32IB-NEXT: bltz t4, .LBB5_7
+; RV32IB-NEXT: not t1, a4
+; RV32IB-NEXT: addi a1, zero, 63
+; RV32IB-NEXT: andn a5, a1, a4
+; RV32IB-NEXT: addi a1, a5, -32
+; RV32IB-NEXT: srli t3, a3, 1
+; RV32IB-NEXT: bltz a1, .LBB5_7
; RV32IB-NEXT: # %bb.4:
-; RV32IB-NEXT: mv t3, zero
-; RV32IB-NEXT: or t0, t0, t3
-; RV32IB-NEXT: bgez t4, .LBB5_8
+; RV32IB-NEXT: mv t0, zero
+; RV32IB-NEXT: bgez a1, .LBB5_8
; RV32IB-NEXT: .LBB5_5:
-; RV32IB-NEXT: srl a2, a2, t2
-; RV32IB-NEXT: sub a5, a7, t2
-; RV32IB-NEXT: slli a3, a3, 1
-; RV32IB-NEXT: sll a3, a3, a5
-; RV32IB-NEXT: or a2, a2, a3
-; RV32IB-NEXT: cmov a1, t1, t0, a1
-; RV32IB-NEXT: bgez a6, .LBB5_9
+; RV32IB-NEXT: fsl a1, a3, a6, a2
+; RV32IB-NEXT: srl a1, a1, t1
+; RV32IB-NEXT: sub a2, a6, a5
+; RV32IB-NEXT: slli a3, t3, 1
+; RV32IB-NEXT: sll a2, a3, a2
+; RV32IB-NEXT: or a2, a1, a2
+; RV32IB-NEXT: or a1, a7, t0
+; RV32IB-NEXT: bgez t2, .LBB5_9
; RV32IB-NEXT: .LBB5_6:
-; RV32IB-NEXT: sll a3, a0, a4
-; RV32IB-NEXT: j .LBB5_10
+; RV32IB-NEXT: sll a0, a0, a4
+; RV32IB-NEXT: or a0, a0, a2
+; RV32IB-NEXT: ret
; RV32IB-NEXT: .LBB5_7:
-; RV32IB-NEXT: srl t3, a3, t2
-; RV32IB-NEXT: or t0, t0, t3
-; RV32IB-NEXT: bltz t4, .LBB5_5
+; RV32IB-NEXT: srl t0, t3, t1
+; RV32IB-NEXT: bltz a1, .LBB5_5
; RV32IB-NEXT: .LBB5_8:
-; RV32IB-NEXT: srl a2, a3, t4
-; RV32IB-NEXT: cmov a1, t1, t0, a1
-; RV32IB-NEXT: bltz a6, .LBB5_6
+; RV32IB-NEXT: srl a2, t3, a1
+; RV32IB-NEXT: or a1, a7, t0
+; RV32IB-NEXT: bltz t2, .LBB5_6
; RV32IB-NEXT: .LBB5_9:
-; RV32IB-NEXT: mv a3, zero
-; RV32IB-NEXT: .LBB5_10:
-; RV32IB-NEXT: or a2, a3, a2
-; RV32IB-NEXT: cmov a0, t1, a2, a0
+; RV32IB-NEXT: or a0, zero, a2
; RV32IB-NEXT: ret
;
; RV32IBT-LABEL: fshl_i64:
; RV32IBT: # %bb.0:
-; RV32IBT-NEXT: andi t1, a4, 63
-; RV32IBT-NEXT: addi a6, t1, -32
-; RV32IBT-NEXT: addi a7, zero, 31
-; RV32IBT-NEXT: bltz a6, .LBB5_2
+; RV32IBT-NEXT: andi a5, a4, 63
+; RV32IBT-NEXT: addi t1, a5, -32
+; RV32IBT-NEXT: addi a6, zero, 31
+; RV32IBT-NEXT: bltz t1, .LBB5_2
; RV32IBT-NEXT: # %bb.1:
-; RV32IBT-NEXT: sll t0, a0, a6
+; RV32IBT-NEXT: sll a7, a0, t1
; RV32IBT-NEXT: j .LBB5_3
; RV32IBT-NEXT: .LBB5_2:
-; RV32IBT-NEXT: sll t0, a1, a4
-; RV32IBT-NEXT: sub t2, a7, t1
-; RV32IBT-NEXT: srli a5, a0, 1
-; RV32IBT-NEXT: srl a5, a5, t2
-; RV32IBT-NEXT: or t0, t0, a5
+; RV32IBT-NEXT: sll a7, a1, a4
+; RV32IBT-NEXT: sub a5, a6, a5
+; RV32IBT-NEXT: srli a1, a0, 1
+; RV32IBT-NEXT: srl a1, a1, a5
+; RV32IBT-NEXT: or a7, a7, a1
; RV32IBT-NEXT: .LBB5_3:
-; RV32IBT-NEXT: addi a5, zero, 32
-; RV32IBT-NEXT: sub t4, a5, t1
-; RV32IBT-NEXT: addi a5, zero, 64
-; RV32IBT-NEXT: sub t2, a5, t1
-; RV32IBT-NEXT: bltz t4, .LBB5_7
+; RV32IBT-NEXT: not a1, a4
+; RV32IBT-NEXT: andi t3, a1, 63
+; RV32IBT-NEXT: addi a5, t3, -32
+; RV32IBT-NEXT: srli t2, a3, 1
+; RV32IBT-NEXT: bltz a5, .LBB5_7
; RV32IBT-NEXT: # %bb.4:
-; RV32IBT-NEXT: mv t3, zero
-; RV32IBT-NEXT: or t0, t0, t3
-; RV32IBT-NEXT: bgez t4, .LBB5_8
+; RV32IBT-NEXT: mv t0, zero
+; RV32IBT-NEXT: bgez a5, .LBB5_8
; RV32IBT-NEXT: .LBB5_5:
-; RV32IBT-NEXT: srl a2, a2, t2
-; RV32IBT-NEXT: sub a5, a7, t2
-; RV32IBT-NEXT: slli a3, a3, 1
-; RV32IBT-NEXT: sll a3, a3, a5
-; RV32IBT-NEXT: or a2, a2, a3
-; RV32IBT-NEXT: cmov a1, t1, t0, a1
-; RV32IBT-NEXT: bgez a6, .LBB5_9
+; RV32IBT-NEXT: fsl a2, a3, a6, a2
+; RV32IBT-NEXT: srl a1, a2, a1
+; RV32IBT-NEXT: sub a2, a6, t3
+; RV32IBT-NEXT: slli a3, t2, 1
+; RV32IBT-NEXT: sll a2, a3, a2
+; RV32IBT-NEXT: or a2, a1, a2
+; RV32IBT-NEXT: or a1, a7, t0
+; RV32IBT-NEXT: bgez t1, .LBB5_9
; RV32IBT-NEXT: .LBB5_6:
-; RV32IBT-NEXT: sll a3, a0, a4
-; RV32IBT-NEXT: j .LBB5_10
+; RV32IBT-NEXT: sll a0, a0, a4
+; RV32IBT-NEXT: or a0, a0, a2
+; RV32IBT-NEXT: ret
; RV32IBT-NEXT: .LBB5_7:
-; RV32IBT-NEXT: srl t3, a3, t2
-; RV32IBT-NEXT: or t0, t0, t3
-; RV32IBT-NEXT: bltz t4, .LBB5_5
+; RV32IBT-NEXT: srl t0, t2, a1
+; RV32IBT-NEXT: bltz a5, .LBB5_5
; RV32IBT-NEXT: .LBB5_8:
-; RV32IBT-NEXT: srl a2, a3, t4
-; RV32IBT-NEXT: cmov a1, t1, t0, a1
-; RV32IBT-NEXT: bltz a6, .LBB5_6
+; RV32IBT-NEXT: srl a2, t2, a5
+; RV32IBT-NEXT: or a1, a7, t0
+; RV32IBT-NEXT: bltz t1, .LBB5_6
; RV32IBT-NEXT: .LBB5_9:
-; RV32IBT-NEXT: mv a3, zero
-; RV32IBT-NEXT: .LBB5_10:
-; RV32IBT-NEXT: or a2, a3, a2
-; RV32IBT-NEXT: cmov a0, t1, a2, a0
+; RV32IBT-NEXT: or a0, zero, a2
; RV32IBT-NEXT: ret
%1 = tail call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %c)
ret i64 %1
@@ -319,16 +306,11 @@ declare i32 @llvm.fshr.i32(i32, i32, i32)
define i32 @fshr_i32(i32 %a, i32 %b, i32 %c) nounwind {
; RV32I-LABEL: fshr_i32:
; RV32I: # %bb.0:
-; RV32I-NEXT: andi a3, a2, 31
-; RV32I-NEXT: beqz a3, .LBB6_2
-; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: srl a1, a1, a2
-; RV32I-NEXT: addi a2, zero, 32
-; RV32I-NEXT: sub a2, a2, a3
+; RV32I-NEXT: not a2, a2
+; RV32I-NEXT: slli a0, a0, 1
; RV32I-NEXT: sll a0, a0, a2
-; RV32I-NEXT: or a1, a0, a1
-; RV32I-NEXT: .LBB6_2:
-; RV32I-NEXT: mv a0, a1
+; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: ret
;
; RV32IB-LABEL: fshr_i32:
@@ -355,162 +337,157 @@ declare i64 @llvm.fshr.i64(i64, i64, i64)
define i64 @fshr_i64(i64 %a, i64 %b, i64 %c) nounwind {
; RV32I-LABEL: fshr_i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: mv t1, a3
-; RV32I-NEXT: mv a6, a2
; RV32I-NEXT: andi a5, a4, 63
-; RV32I-NEXT: addi t2, a5, -32
-; RV32I-NEXT: addi a7, zero, 31
-; RV32I-NEXT: bltz t2, .LBB7_2
+; RV32I-NEXT: addi t1, a5, -32
+; RV32I-NEXT: addi a6, zero, 31
+; RV32I-NEXT: bltz t1, .LBB7_2
; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: srl t0, t1, t2
+; RV32I-NEXT: srl a7, a3, t1
; RV32I-NEXT: j .LBB7_3
; RV32I-NEXT: .LBB7_2:
-; RV32I-NEXT: srl t0, a6, a4
-; RV32I-NEXT: sub a3, a7, a5
-; RV32I-NEXT: slli a2, t1, 1
-; RV32I-NEXT: sll a2, a2, a3
-; RV32I-NEXT: or t0, t0, a2
+; RV32I-NEXT: srl a7, a2, a4
+; RV32I-NEXT: sub a5, a6, a5
+; RV32I-NEXT: slli a2, a3, 1
+; RV32I-NEXT: sll a2, a2, a5
+; RV32I-NEXT: or a7, a7, a2
; RV32I-NEXT: .LBB7_3:
-; RV32I-NEXT: addi a2, zero, 32
-; RV32I-NEXT: sub a3, a2, a5
-; RV32I-NEXT: addi a2, zero, 64
-; RV32I-NEXT: sub a2, a2, a5
-; RV32I-NEXT: bltz a3, .LBB7_5
+; RV32I-NEXT: not a2, a4
+; RV32I-NEXT: andi t2, a2, 63
+; RV32I-NEXT: addi a5, t2, -32
+; RV32I-NEXT: slli t3, a0, 1
+; RV32I-NEXT: bltz a5, .LBB7_7
; RV32I-NEXT: # %bb.4:
-; RV32I-NEXT: mv t3, zero
-; RV32I-NEXT: bnez a5, .LBB7_6
-; RV32I-NEXT: j .LBB7_7
+; RV32I-NEXT: mv t0, zero
+; RV32I-NEXT: bgez a5, .LBB7_8
; RV32I-NEXT: .LBB7_5:
-; RV32I-NEXT: sll t3, a0, a2
-; RV32I-NEXT: beqz a5, .LBB7_7
+; RV32I-NEXT: lui a5, 524288
+; RV32I-NEXT: addi a5, a5, -1
+; RV32I-NEXT: and t3, a0, a5
+; RV32I-NEXT: sub a5, a6, t2
+; RV32I-NEXT: srl a5, t3, a5
+; RV32I-NEXT: srli a0, a0, 31
+; RV32I-NEXT: slli a1, a1, 1
+; RV32I-NEXT: or a0, a1, a0
+; RV32I-NEXT: sll a0, a0, a2
+; RV32I-NEXT: or a1, a0, a5
+; RV32I-NEXT: or a0, t0, a7
+; RV32I-NEXT: bgez t1, .LBB7_9
; RV32I-NEXT: .LBB7_6:
-; RV32I-NEXT: or a6, t3, t0
+; RV32I-NEXT: srl a2, a3, a4
+; RV32I-NEXT: or a1, a1, a2
+; RV32I-NEXT: ret
; RV32I-NEXT: .LBB7_7:
-; RV32I-NEXT: bltz a3, .LBB7_10
-; RV32I-NEXT: # %bb.8:
-; RV32I-NEXT: sll a0, a0, a3
-; RV32I-NEXT: bgez t2, .LBB7_11
+; RV32I-NEXT: sll t0, t3, a2
+; RV32I-NEXT: bltz a5, .LBB7_5
+; RV32I-NEXT: .LBB7_8:
+; RV32I-NEXT: sll a1, t3, a5
+; RV32I-NEXT: or a0, t0, a7
+; RV32I-NEXT: bltz t1, .LBB7_6
; RV32I-NEXT: .LBB7_9:
-; RV32I-NEXT: srl a1, t1, a4
-; RV32I-NEXT: bnez a5, .LBB7_12
-; RV32I-NEXT: j .LBB7_13
-; RV32I-NEXT: .LBB7_10:
-; RV32I-NEXT: sll a1, a1, a2
-; RV32I-NEXT: sub a2, a7, a2
-; RV32I-NEXT: srli a0, a0, 1
-; RV32I-NEXT: srl a0, a0, a2
-; RV32I-NEXT: or a0, a1, a0
-; RV32I-NEXT: bltz t2, .LBB7_9
-; RV32I-NEXT: .LBB7_11:
-; RV32I-NEXT: mv a1, zero
-; RV32I-NEXT: beqz a5, .LBB7_13
-; RV32I-NEXT: .LBB7_12:
-; RV32I-NEXT: or t1, a0, a1
-; RV32I-NEXT: .LBB7_13:
-; RV32I-NEXT: mv a0, a6
-; RV32I-NEXT: mv a1, t1
+; RV32I-NEXT: or a1, a1, zero
; RV32I-NEXT: ret
;
; RV32IB-LABEL: fshr_i64:
; RV32IB: # %bb.0:
-; RV32IB-NEXT: andi t1, a4, 63
-; RV32IB-NEXT: addi a6, t1, -32
-; RV32IB-NEXT: addi a7, zero, 31
-; RV32IB-NEXT: bltz a6, .LBB7_2
+; RV32IB-NEXT: andi a5, a4, 63
+; RV32IB-NEXT: addi t2, a5, -32
+; RV32IB-NEXT: addi a6, zero, 31
+; RV32IB-NEXT: bltz t2, .LBB7_2
; RV32IB-NEXT: # %bb.1:
-; RV32IB-NEXT: srl t0, a3, a6
+; RV32IB-NEXT: srl a7, a3, t2
; RV32IB-NEXT: j .LBB7_3
; RV32IB-NEXT: .LBB7_2:
-; RV32IB-NEXT: srl t0, a2, a4
-; RV32IB-NEXT: sub t2, a7, t1
-; RV32IB-NEXT: slli a5, a3, 1
-; RV32IB-NEXT: sll a5, a5, t2
-; RV32IB-NEXT: or t0, t0, a5
+; RV32IB-NEXT: srl a7, a2, a4
+; RV32IB-NEXT: sub a5, a6, a5
+; RV32IB-NEXT: slli a2, a3, 1
+; RV32IB-NEXT: sll a2, a2, a5
+; RV32IB-NEXT: or a7, a7, a2
; RV32IB-NEXT: .LBB7_3:
-; RV32IB-NEXT: addi a5, zero, 32
-; RV32IB-NEXT: sub t4, a5, t1
-; RV32IB-NEXT: addi a5, zero, 64
-; RV32IB-NEXT: sub t2, a5, t1
-; RV32IB-NEXT: bltz t4, .LBB7_7
+; RV32IB-NEXT: not t1, a4
+; RV32IB-NEXT: addi a2, zero, 63
+; RV32IB-NEXT: andn a2, a2, a4
+; RV32IB-NEXT: addi a5, a2, -32
+; RV32IB-NEXT: slli t3, a0, 1
+; RV32IB-NEXT: bltz a5, .LBB7_7
; RV32IB-NEXT: # %bb.4:
-; RV32IB-NEXT: mv t3, zero
-; RV32IB-NEXT: or t0, t3, t0
-; RV32IB-NEXT: bgez t4, .LBB7_8
+; RV32IB-NEXT: mv t0, zero
+; RV32IB-NEXT: bgez a5, .LBB7_8
; RV32IB-NEXT: .LBB7_5:
-; RV32IB-NEXT: sll a1, a1, t2
-; RV32IB-NEXT: sub a5, a7, t2
-; RV32IB-NEXT: srli a0, a0, 1
-; RV32IB-NEXT: srl a0, a0, a5
+; RV32IB-NEXT: addi a5, zero, 1
+; RV32IB-NEXT: fsl a1, a1, a5, a0
+; RV32IB-NEXT: sll a1, a1, t1
+; RV32IB-NEXT: sub a2, a6, a2
+; RV32IB-NEXT: lui a5, 524288
+; RV32IB-NEXT: addi a5, a5, -1
+; RV32IB-NEXT: and a0, a0, a5
+; RV32IB-NEXT: srl a0, a0, a2
; RV32IB-NEXT: or a1, a1, a0
-; RV32IB-NEXT: cmov a0, t1, t0, a2
-; RV32IB-NEXT: bgez a6, .LBB7_9
+; RV32IB-NEXT: or a0, t0, a7
+; RV32IB-NEXT: bgez t2, .LBB7_9
; RV32IB-NEXT: .LBB7_6:
; RV32IB-NEXT: srl a2, a3, a4
-; RV32IB-NEXT: j .LBB7_10
+; RV32IB-NEXT: or a1, a1, a2
+; RV32IB-NEXT: ret
; RV32IB-NEXT: .LBB7_7:
-; RV32IB-NEXT: sll t3, a0, t2
-; RV32IB-NEXT: or t0, t3, t0
-; RV32IB-NEXT: bltz t4, .LBB7_5
+; RV32IB-NEXT: sll t0, t3, t1
+; RV32IB-NEXT: bltz a5, .LBB7_5
; RV32IB-NEXT: .LBB7_8:
-; RV32IB-NEXT: sll a1, a0, t4
-; RV32IB-NEXT: cmov a0, t1, t0, a2
-; RV32IB-NEXT: bltz a6, .LBB7_6
+; RV32IB-NEXT: sll a1, t3, a5
+; RV32IB-NEXT: or a0, t0, a7
+; RV32IB-NEXT: bltz t2, .LBB7_6
; RV32IB-NEXT: .LBB7_9:
-; RV32IB-NEXT: mv a2, zero
-; RV32IB-NEXT: .LBB7_10:
-; RV32IB-NEXT: or a1, a1, a2
-; RV32IB-NEXT: cmov a1, t1, a1, a3
+; RV32IB-NEXT: or a1, a1, zero
; RV32IB-NEXT: ret
;
; RV32IBT-LABEL: fshr_i64:
; RV32IBT: # %bb.0:
-; RV32IBT-NEXT: andi t1, a4, 63
-; RV32IBT-NEXT: addi a6, t1, -32
-; RV32IBT-NEXT: addi a7, zero, 31
-; RV32IBT-NEXT: bltz a6, .LBB7_2
+; RV32IBT-NEXT: not a7, a4
+; RV32IBT-NEXT: andi t1, a7, 63
+; RV32IBT-NEXT: addi t0, zero, 31
+; RV32IBT-NEXT: addi t2, t1, -32
+; RV32IBT-NEXT: slli a6, a0, 1
+; RV32IBT-NEXT: bltz t2, .LBB7_2
; RV32IBT-NEXT: # %bb.1:
-; RV32IBT-NEXT: srl t0, a3, a6
+; RV32IBT-NEXT: sll t1, a6, t2
; RV32IBT-NEXT: j .LBB7_3
; RV32IBT-NEXT: .LBB7_2:
-; RV32IBT-NEXT: srl t0, a2, a4
-; RV32IBT-NEXT: sub t2, a7, t1
-; RV32IBT-NEXT: slli a5, a3, 1
-; RV32IBT-NEXT: sll a5, a5, t2
-; RV32IBT-NEXT: or t0, t0, a5
+; RV32IBT-NEXT: addi a5, zero, 1
+; RV32IBT-NEXT: fsl a1, a1, a5, a0
+; RV32IBT-NEXT: sll a1, a1, a7
+; RV32IBT-NEXT: lui a5, 524288
+; RV32IBT-NEXT: addi a5, a5, -1
+; RV32IBT-NEXT: and a0, a0, a5
+; RV32IBT-NEXT: sub a5, t0, t1
+; RV32IBT-NEXT: srl a0, a0, a5
+; RV32IBT-NEXT: or t1, a1, a0
; RV32IBT-NEXT: .LBB7_3:
-; RV32IBT-NEXT: addi a5, zero, 32
-; RV32IBT-NEXT: sub t4, a5, t1
-; RV32IBT-NEXT: addi a5, zero, 64
-; RV32IBT-NEXT: sub t2, a5, t1
-; RV32IBT-NEXT: bltz t4, .LBB7_7
+; RV32IBT-NEXT: andi a0, a4, 63
+; RV32IBT-NEXT: addi a5, a0, -32
+; RV32IBT-NEXT: bltz a5, .LBB7_7
; RV32IBT-NEXT: # %bb.4:
-; RV32IBT-NEXT: mv t3, zero
-; RV32IBT-NEXT: or t0, t3, t0
-; RV32IBT-NEXT: bgez t4, .LBB7_8
+; RV32IBT-NEXT: mv a1, zero
+; RV32IBT-NEXT: bgez a5, .LBB7_8
; RV32IBT-NEXT: .LBB7_5:
-; RV32IBT-NEXT: sll a1, a1, t2
-; RV32IBT-NEXT: sub a5, a7, t2
-; RV32IBT-NEXT: srli a0, a0, 1
-; RV32IBT-NEXT: srl a0, a0, a5
-; RV32IBT-NEXT: or a1, a1, a0
-; RV32IBT-NEXT: cmov a0, t1, t0, a2
-; RV32IBT-NEXT: bgez a6, .LBB7_9
+; RV32IBT-NEXT: srl a2, a2, a4
+; RV32IBT-NEXT: sub a0, t0, a0
+; RV32IBT-NEXT: slli a3, a3, 1
+; RV32IBT-NEXT: sll a0, a3, a0
+; RV32IBT-NEXT: or a2, a2, a0
+; RV32IBT-NEXT: or a1, t1, a1
+; RV32IBT-NEXT: bgez t2, .LBB7_9
; RV32IBT-NEXT: .LBB7_6:
-; RV32IBT-NEXT: srl a2, a3, a4
-; RV32IBT-NEXT: j .LBB7_10
+; RV32IBT-NEXT: sll a0, a6, a7
+; RV32IBT-NEXT: or a0, a0, a2
+; RV32IBT-NEXT: ret
; RV32IBT-NEXT: .LBB7_7:
-; RV32IBT-NEXT: sll t3, a0, t2
-; RV32IBT-NEXT: or t0, t3, t0
-; RV32IBT-NEXT: bltz t4, .LBB7_5
+; RV32IBT-NEXT: srl a1, a3, a4
+; RV32IBT-NEXT: bltz a5, .LBB7_5
; RV32IBT-NEXT: .LBB7_8:
-; RV32IBT-NEXT: sll a1, a0, t4
-; RV32IBT-NEXT: cmov a0, t1, t0, a2
-; RV32IBT-NEXT: bltz a6, .LBB7_6
+; RV32IBT-NEXT: srl a2, a3, a5
+; RV32IBT-NEXT: or a1, t1, a1
+; RV32IBT-NEXT: bltz t2, .LBB7_6
; RV32IBT-NEXT: .LBB7_9:
-; RV32IBT-NEXT: mv a2, zero
-; RV32IBT-NEXT: .LBB7_10:
-; RV32IBT-NEXT: or a1, a1, a2
-; RV32IBT-NEXT: cmov a1, t1, a1, a3
+; RV32IBT-NEXT: or a0, zero, a2
; RV32IBT-NEXT: ret
%1 = tail call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %c)
ret i64 %1
diff --git a/llvm/test/CodeGen/RISCV/rv64Zbbp.ll b/llvm/test/CodeGen/RISCV/rv64Zbbp.ll
index c3a6799739d2..f78a4a3a000a 100644
--- a/llvm/test/CodeGen/RISCV/rv64Zbbp.ll
+++ b/llvm/test/CodeGen/RISCV/rv64Zbbp.ll
@@ -230,7 +230,7 @@ define signext i32 @ror_i32(i32 signext %a, i32 signext %b) nounwind {
; RV64I-NEXT: srlw a2, a0, a1
; RV64I-NEXT: neg a1, a1
; RV64I-NEXT: sllw a0, a0, a1
-; RV64I-NEXT: or a0, a0, a2
+; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: ret
;
; RV64IB-LABEL: ror_i32:
@@ -259,7 +259,7 @@ define i64 @ror_i64(i64 %a, i64 %b) nounwind {
; RV64I-NEXT: srl a2, a0, a1
; RV64I-NEXT: neg a1, a1
; RV64I-NEXT: sll a0, a0, a1
-; RV64I-NEXT: or a0, a0, a2
+; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: ret
;
; RV64IB-LABEL: ror_i64:
@@ -291,7 +291,7 @@ define signext i32 @rori_i32(i32 signext %a) nounwind {
;
; RV64IB-LABEL: rori_i32:
; RV64IB: # %bb.0:
-; RV64IB-NEXT: fsriw a0, a0, a0, 1
+; RV64IB-NEXT: roriw a0, a0, 1
; RV64IB-NEXT: ret
;
; RV64IBB-LABEL: rori_i32:
diff --git a/llvm/test/CodeGen/RISCV/rv64Zbt.ll b/llvm/test/CodeGen/RISCV/rv64Zbt.ll
index 22e25fadbd91..3e6201bac967 100644
--- a/llvm/test/CodeGen/RISCV/rv64Zbt.ll
+++ b/llvm/test/CodeGen/RISCV/rv64Zbt.ll
@@ -109,15 +109,14 @@ declare i32 @llvm.fshl.i32(i32, i32, i32)
define signext i32 @fshl_i32(i32 signext %a, i32 signext %b, i32 signext %c) nounwind {
; RV64I-LABEL: fshl_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: andi a3, a2, 31
-; RV64I-NEXT: beqz a3, .LBB4_2
-; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: addi a4, zero, 32
-; RV64I-NEXT: sub a2, a4, a2
-; RV64I-NEXT: srlw a1, a1, a2
-; RV64I-NEXT: sllw a0, a0, a3
+; RV64I-NEXT: andi a2, a2, 31
+; RV64I-NEXT: sll a0, a0, a2
+; RV64I-NEXT: not a2, a2
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: srli a1, a1, 1
+; RV64I-NEXT: srl a1, a1, a2
; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: .LBB4_2:
+; RV64I-NEXT: sext.w a0, a0
; RV64I-NEXT: ret
;
; RV64IB-LABEL: fshl_i32:
@@ -138,15 +137,11 @@ declare i64 @llvm.fshl.i64(i64, i64, i64)
define i64 @fshl_i64(i64 %a, i64 %b, i64 %c) nounwind {
; RV64I-LABEL: fshl_i64:
; RV64I: # %bb.0:
-; RV64I-NEXT: andi a3, a2, 63
-; RV64I-NEXT: beqz a3, .LBB5_2
-; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: sll a0, a0, a2
-; RV64I-NEXT: addi a2, zero, 64
-; RV64I-NEXT: sub a2, a2, a3
+; RV64I-NEXT: not a2, a2
+; RV64I-NEXT: srli a1, a1, 1
; RV64I-NEXT: srl a1, a1, a2
; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: .LBB5_2:
; RV64I-NEXT: ret
;
; RV64IB-LABEL: fshl_i64:
@@ -167,16 +162,15 @@ declare i32 @llvm.fshr.i32(i32, i32, i32)
define signext i32 @fshr_i32(i32 signext %a, i32 signext %b, i32 signext %c) nounwind {
; RV64I-LABEL: fshr_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: andi a3, a2, 31
-; RV64I-NEXT: beqz a3, .LBB6_2
-; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: srlw a1, a1, a3
-; RV64I-NEXT: addi a3, zero, 32
-; RV64I-NEXT: sub a2, a3, a2
-; RV64I-NEXT: sllw a0, a0, a2
-; RV64I-NEXT: or a1, a0, a1
-; RV64I-NEXT: .LBB6_2:
-; RV64I-NEXT: mv a0, a1
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: andi a2, a2, 31
+; RV64I-NEXT: ori a3, a2, 32
+; RV64I-NEXT: srl a1, a1, a3
+; RV64I-NEXT: slli a0, a0, 1
+; RV64I-NEXT: xori a2, a2, 31
+; RV64I-NEXT: sll a0, a0, a2
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: sext.w a0, a0
; RV64I-NEXT: ret
;
; RV64IB-LABEL: fshr_i32:
@@ -197,16 +191,11 @@ declare i64 @llvm.fshr.i64(i64, i64, i64)
define i64 @fshr_i64(i64 %a, i64 %b, i64 %c) nounwind {
; RV64I-LABEL: fshr_i64:
; RV64I: # %bb.0:
-; RV64I-NEXT: andi a3, a2, 63
-; RV64I-NEXT: beqz a3, .LBB7_2
-; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: srl a1, a1, a2
-; RV64I-NEXT: addi a2, zero, 64
-; RV64I-NEXT: sub a2, a2, a3
+; RV64I-NEXT: not a2, a2
+; RV64I-NEXT: slli a0, a0, 1
; RV64I-NEXT: sll a0, a0, a2
-; RV64I-NEXT: or a1, a0, a1
-; RV64I-NEXT: .LBB7_2:
-; RV64I-NEXT: mv a0, a1
+; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: ret
;
; RV64IB-LABEL: fshr_i64:
diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll
index 154ebabf812d..24b946265e28 100644
--- a/llvm/test/CodeGen/X86/fshl.ll
+++ b/llvm/test/CodeGen/X86/fshl.ll
@@ -178,58 +178,41 @@ define i32 @var_shift_i32_pgso(i32 %x, i32 %y, i32 %z) nounwind !prof !14 {
define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
; X86-FAST-LABEL: var_shift_i64:
; X86-FAST: # %bb.0:
-; X86-FAST-NEXT: pushl %ebp
; X86-FAST-NEXT: pushl %ebx
; X86-FAST-NEXT: pushl %edi
; X86-FAST-NEXT: pushl %esi
-; X86-FAST-NEXT: pushl %eax
-; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-FAST-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-FAST-NEXT: andl $63, %ebx
-; X86-FAST-NEXT: movl %eax, %edi
-; X86-FAST-NEXT: movl %ebx, %ecx
-; X86-FAST-NEXT: shll %cl, %edi
-; X86-FAST-NEXT: shldl %cl, %eax, %ebp
-; X86-FAST-NEXT: testb $32, %bl
+; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-FAST-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-FAST-NEXT: movb %ch, %cl
+; X86-FAST-NEXT: notb %cl
+; X86-FAST-NEXT: shrdl $1, %edi, %esi
+; X86-FAST-NEXT: shrl %edi
+; X86-FAST-NEXT: shrdl %cl, %edi, %esi
+; X86-FAST-NEXT: shrl %cl, %edi
+; X86-FAST-NEXT: testb $32, %cl
; X86-FAST-NEXT: je .LBB5_2
; X86-FAST-NEXT: # %bb.1:
-; X86-FAST-NEXT: movl %edi, %ebp
+; X86-FAST-NEXT: movl %edi, %esi
; X86-FAST-NEXT: xorl %edi, %edi
; X86-FAST-NEXT: .LBB5_2:
-; X86-FAST-NEXT: movb $64, %cl
-; X86-FAST-NEXT: subb %bl, %cl
-; X86-FAST-NEXT: movl %edx, %esi
-; X86-FAST-NEXT: shrl %cl, %esi
-; X86-FAST-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-FAST-NEXT: testb $32, %cl
-; X86-FAST-NEXT: jne .LBB5_3
-; X86-FAST-NEXT: # %bb.4:
-; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-FAST-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-FAST-NEXT: testl %ebx, %ebx
-; X86-FAST-NEXT: jne .LBB5_6
-; X86-FAST-NEXT: jmp .LBB5_7
-; X86-FAST-NEXT: .LBB5_3:
-; X86-FAST-NEXT: movl %esi, %ecx
-; X86-FAST-NEXT: xorl %esi, %esi
-; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-FAST-NEXT: testl %ebx, %ebx
-; X86-FAST-NEXT: je .LBB5_7
-; X86-FAST-NEXT: .LBB5_6:
-; X86-FAST-NEXT: orl %esi, %ebp
-; X86-FAST-NEXT: orl %ecx, %edi
-; X86-FAST-NEXT: movl %edi, %eax
-; X86-FAST-NEXT: movl %ebp, %edx
-; X86-FAST-NEXT: .LBB5_7:
-; X86-FAST-NEXT: addl $4, %esp
+; X86-FAST-NEXT: movl %ebx, %eax
+; X86-FAST-NEXT: movb %ch, %cl
+; X86-FAST-NEXT: shll %cl, %eax
+; X86-FAST-NEXT: shldl %cl, %ebx, %edx
+; X86-FAST-NEXT: testb $32, %ch
+; X86-FAST-NEXT: je .LBB5_4
+; X86-FAST-NEXT: # %bb.3:
+; X86-FAST-NEXT: movl %eax, %edx
+; X86-FAST-NEXT: xorl %eax, %eax
+; X86-FAST-NEXT: .LBB5_4:
+; X86-FAST-NEXT: orl %edi, %edx
+; X86-FAST-NEXT: orl %esi, %eax
; X86-FAST-NEXT: popl %esi
; X86-FAST-NEXT: popl %edi
; X86-FAST-NEXT: popl %ebx
-; X86-FAST-NEXT: popl %ebp
; X86-FAST-NEXT: retl
;
; X86-SLOW-LABEL: var_shift_i64:
@@ -238,59 +221,55 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
; X86-SLOW-NEXT: pushl %ebx
; X86-SLOW-NEXT: pushl %edi
; X86-SLOW-NEXT: pushl %esi
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-SLOW-NEXT: andl $63, %ebx
-; X86-SLOW-NEXT: movb $64, %ch
-; X86-SLOW-NEXT: subb %bl, %ch
-; X86-SLOW-NEXT: movb %ch, %cl
-; X86-SLOW-NEXT: shrl %cl, %edx
-; X86-SLOW-NEXT: notb %cl
-; X86-SLOW-NEXT: addl %eax, %eax
-; X86-SLOW-NEXT: shll %cl, %eax
-; X86-SLOW-NEXT: movb %bl, %cl
-; X86-SLOW-NEXT: shll %cl, %edi
+; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-SLOW-NEXT: movl %esi, %ebp
-; X86-SLOW-NEXT: shrl %ebp
-; X86-SLOW-NEXT: notb %cl
-; X86-SLOW-NEXT: shrl %cl, %ebp
-; X86-SLOW-NEXT: movb %bl, %cl
-; X86-SLOW-NEXT: shll %cl, %esi
-; X86-SLOW-NEXT: testb $32, %bl
+; X86-SLOW-NEXT: shrl %eax
+; X86-SLOW-NEXT: movl %esi, %edi
+; X86-SLOW-NEXT: shll $31, %edi
+; X86-SLOW-NEXT: orl %eax, %edi
+; X86-SLOW-NEXT: movl %ecx, %eax
+; X86-SLOW-NEXT: movb %cl, %ch
+; X86-SLOW-NEXT: notb %ch
+; X86-SLOW-NEXT: movb %ch, %cl
+; X86-SLOW-NEXT: shrl %cl, %edi
+; X86-SLOW-NEXT: shrl %esi
+; X86-SLOW-NEXT: leal (%esi,%esi), %ebp
+; X86-SLOW-NEXT: movb %al, %cl
+; X86-SLOW-NEXT: shll %cl, %ebp
+; X86-SLOW-NEXT: shll %cl, %ebx
+; X86-SLOW-NEXT: movl %edx, %eax
+; X86-SLOW-NEXT: shrl %eax
+; X86-SLOW-NEXT: movb %ch, %cl
+; X86-SLOW-NEXT: shrl %cl, %eax
+; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-SLOW-NEXT: shll %cl, %edx
+; X86-SLOW-NEXT: testb $32, {{[0-9]+}}(%esp)
; X86-SLOW-NEXT: jne .LBB5_1
; X86-SLOW-NEXT: # %bb.2:
-; X86-SLOW-NEXT: orl %ebp, %edi
+; X86-SLOW-NEXT: orl %eax, %ebx
; X86-SLOW-NEXT: jmp .LBB5_3
; X86-SLOW-NEXT: .LBB5_1:
-; X86-SLOW-NEXT: movl %esi, %edi
-; X86-SLOW-NEXT: xorl %esi, %esi
+; X86-SLOW-NEXT: movl %edx, %ebx
+; X86-SLOW-NEXT: xorl %edx, %edx
; X86-SLOW-NEXT: .LBB5_3:
; X86-SLOW-NEXT: movb %ch, %cl
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-SLOW-NEXT: shrl %cl, %ebp
+; X86-SLOW-NEXT: shrl %cl, %esi
; X86-SLOW-NEXT: testb $32, %ch
; X86-SLOW-NEXT: jne .LBB5_4
; X86-SLOW-NEXT: # %bb.5:
-; X86-SLOW-NEXT: orl %edx, %eax
-; X86-SLOW-NEXT: movl %eax, %ecx
+; X86-SLOW-NEXT: orl %edi, %ebp
; X86-SLOW-NEXT: jmp .LBB5_6
; X86-SLOW-NEXT: .LBB5_4:
-; X86-SLOW-NEXT: movl %ebp, %ecx
-; X86-SLOW-NEXT: xorl %ebp, %ebp
+; X86-SLOW-NEXT: movl %esi, %ebp
+; X86-SLOW-NEXT: xorl %esi, %esi
; X86-SLOW-NEXT: .LBB5_6:
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT: testl %ebx, %ebx
-; X86-SLOW-NEXT: je .LBB5_8
-; X86-SLOW-NEXT: # %bb.7:
-; X86-SLOW-NEXT: orl %ebp, %edi
-; X86-SLOW-NEXT: orl %ecx, %esi
-; X86-SLOW-NEXT: movl %edi, %edx
-; X86-SLOW-NEXT: movl %esi, %eax
-; X86-SLOW-NEXT: .LBB5_8:
+; X86-SLOW-NEXT: orl %ebp, %edx
+; X86-SLOW-NEXT: orl %esi, %ebx
+; X86-SLOW-NEXT: movl %edx, %eax
+; X86-SLOW-NEXT: movl %ebx, %edx
; X86-SLOW-NEXT: popl %esi
; X86-SLOW-NEXT: popl %edi
; X86-SLOW-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll
index 73071c19980e..33824080f337 100644
--- a/llvm/test/CodeGen/X86/fshr.ll
+++ b/llvm/test/CodeGen/X86/fshr.ll
@@ -179,46 +179,37 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
; X86-FAST-NEXT: pushl %ebx
; X86-FAST-NEXT: pushl %edi
; X86-FAST-NEXT: pushl %esi
-; X86-FAST-NEXT: pushl %eax
+; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-FAST-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-FAST-NEXT: andl $63, %ebx
-; X86-FAST-NEXT: movb $64, %cl
-; X86-FAST-NEXT: subb %bl, %cl
-; X86-FAST-NEXT: movl %eax, %edi
-; X86-FAST-NEXT: shll %cl, %edi
-; X86-FAST-NEXT: shldl %cl, %eax, %esi
-; X86-FAST-NEXT: testb $32, %cl
+; X86-FAST-NEXT: movb {{[0-9]+}}(%esp), %bl
+; X86-FAST-NEXT: movb %bl, %ch
+; X86-FAST-NEXT: notb %ch
+; X86-FAST-NEXT: shldl $1, %eax, %edx
+; X86-FAST-NEXT: addl %eax, %eax
+; X86-FAST-NEXT: movb %ch, %cl
+; X86-FAST-NEXT: shldl %cl, %eax, %edx
+; X86-FAST-NEXT: movl %ebp, %edi
+; X86-FAST-NEXT: movb %bl, %cl
+; X86-FAST-NEXT: shrl %cl, %edi
+; X86-FAST-NEXT: shrdl %cl, %ebp, %esi
+; X86-FAST-NEXT: testb $32, %bl
; X86-FAST-NEXT: je .LBB5_2
; X86-FAST-NEXT: # %bb.1:
; X86-FAST-NEXT: movl %edi, %esi
; X86-FAST-NEXT: xorl %edi, %edi
; X86-FAST-NEXT: .LBB5_2:
-; X86-FAST-NEXT: movl %edx, %ebp
-; X86-FAST-NEXT: movl %ebx, %ecx
-; X86-FAST-NEXT: shrl %cl, %ebp
-; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-FAST-NEXT: shrdl %cl, %edx, %eax
-; X86-FAST-NEXT: testb $32, %bl
+; X86-FAST-NEXT: movb %ch, %cl
+; X86-FAST-NEXT: shll %cl, %eax
+; X86-FAST-NEXT: testb $32, %ch
; X86-FAST-NEXT: je .LBB5_4
; X86-FAST-NEXT: # %bb.3:
-; X86-FAST-NEXT: movl %ebp, %eax
-; X86-FAST-NEXT: xorl %ebp, %ebp
+; X86-FAST-NEXT: movl %eax, %edx
+; X86-FAST-NEXT: xorl %eax, %eax
; X86-FAST-NEXT: .LBB5_4:
-; X86-FAST-NEXT: testl %ebx, %ebx
-; X86-FAST-NEXT: je .LBB5_6
-; X86-FAST-NEXT: # %bb.5:
-; X86-FAST-NEXT: orl %ebp, %esi
-; X86-FAST-NEXT: orl %eax, %edi
-; X86-FAST-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X86-FAST-NEXT: movl %esi, %edx
-; X86-FAST-NEXT: .LBB5_6:
-; X86-FAST-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-FAST-NEXT: addl $4, %esp
+; X86-FAST-NEXT: orl %edi, %edx
+; X86-FAST-NEXT: orl %esi, %eax
; X86-FAST-NEXT: popl %esi
; X86-FAST-NEXT: popl %edi
; X86-FAST-NEXT: popl %ebx
@@ -231,62 +222,55 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
; X86-SLOW-NEXT: pushl %ebx
; X86-SLOW-NEXT: pushl %edi
; X86-SLOW-NEXT: pushl %esi
-; X86-SLOW-NEXT: subl $8, %esp
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-SLOW-NEXT: pushl %eax
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %bl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-SLOW-NEXT: andl $63, %ebx
-; X86-SLOW-NEXT: movb $64, %ch
-; X86-SLOW-NEXT: subb %bl, %ch
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SLOW-NEXT: movl %eax, %edi
+; X86-SLOW-NEXT: andl $2147483647, %edi # imm = 0x7FFFFFFF
+; X86-SLOW-NEXT: movl %ebx, %ecx
+; X86-SLOW-NEXT: shrl %cl, %edi
+; X86-SLOW-NEXT: movl %eax, %ecx
+; X86-SLOW-NEXT: shrl $31, %ecx
+; X86-SLOW-NEXT: leal (%ecx,%edx,2), %edx
+; X86-SLOW-NEXT: movb %bl, %ch
+; X86-SLOW-NEXT: notb %ch
; X86-SLOW-NEXT: movb %ch, %cl
-; X86-SLOW-NEXT: shll %cl, %eax
-; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT: notb %cl
-; X86-SLOW-NEXT: shrl %esi
-; X86-SLOW-NEXT: shrl %cl, %esi
+; X86-SLOW-NEXT: shll %cl, %edx
; X86-SLOW-NEXT: movb %bl, %cl
-; X86-SLOW-NEXT: shrl %cl, %edi
-; X86-SLOW-NEXT: notb %cl
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT: leal (%eax,%eax), %ebp
+; X86-SLOW-NEXT: shrl %cl, %ebp
+; X86-SLOW-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT: leal (%esi,%esi), %ebp
+; X86-SLOW-NEXT: movb %ch, %cl
; X86-SLOW-NEXT: shll %cl, %ebp
; X86-SLOW-NEXT: movb %bl, %cl
-; X86-SLOW-NEXT: shrl %cl, %eax
+; X86-SLOW-NEXT: shrl %cl, %esi
; X86-SLOW-NEXT: testb $32, %bl
; X86-SLOW-NEXT: jne .LBB5_1
; X86-SLOW-NEXT: # %bb.2:
-; X86-SLOW-NEXT: orl %edi, %ebp
+; X86-SLOW-NEXT: orl (%esp), %ebp # 4-byte Folded Reload
; X86-SLOW-NEXT: jmp .LBB5_3
; X86-SLOW-NEXT: .LBB5_1:
-; X86-SLOW-NEXT: movl %eax, %ebp
-; X86-SLOW-NEXT: xorl %eax, %eax
+; X86-SLOW-NEXT: movl %esi, %ebp
+; X86-SLOW-NEXT: xorl %esi, %esi
; X86-SLOW-NEXT: .LBB5_3:
+; X86-SLOW-NEXT: addl %eax, %eax
; X86-SLOW-NEXT: movb %ch, %cl
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-SLOW-NEXT: shll %cl, %edi
+; X86-SLOW-NEXT: shll %cl, %eax
; X86-SLOW-NEXT: testb $32, %ch
-; X86-SLOW-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-SLOW-NEXT: jne .LBB5_4
; X86-SLOW-NEXT: # %bb.5:
-; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SLOW-NEXT: orl %esi, %ecx
+; X86-SLOW-NEXT: orl %edi, %edx
; X86-SLOW-NEXT: jmp .LBB5_6
; X86-SLOW-NEXT: .LBB5_4:
-; X86-SLOW-NEXT: movl %edi, %ecx
-; X86-SLOW-NEXT: xorl %edi, %edi
+; X86-SLOW-NEXT: movl %eax, %edx
+; X86-SLOW-NEXT: xorl %eax, %eax
; X86-SLOW-NEXT: .LBB5_6:
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SLOW-NEXT: testl %ebx, %ebx
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT: je .LBB5_8
-; X86-SLOW-NEXT: # %bb.7:
-; X86-SLOW-NEXT: orl %ebp, %edi
-; X86-SLOW-NEXT: orl (%esp), %ecx # 4-byte Folded Reload
-; X86-SLOW-NEXT: movl %edi, %eax
-; X86-SLOW-NEXT: movl %ecx, %edx
-; X86-SLOW-NEXT: .LBB5_8:
-; X86-SLOW-NEXT: addl $8, %esp
+; X86-SLOW-NEXT: orl %esi, %edx
+; X86-SLOW-NEXT: orl %ebp, %eax
+; X86-SLOW-NEXT: addl $4, %esp
; X86-SLOW-NEXT: popl %esi
; X86-SLOW-NEXT: popl %edi
; X86-SLOW-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll
index 76e45f43342f..2120cb2581b9 100644
--- a/llvm/test/CodeGen/X86/funnel-shift.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift.ll
@@ -45,46 +45,40 @@ define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) nounwind {
; X32-SSE2-NEXT: pushl %ebx
; X32-SSE2-NEXT: pushl %edi
; X32-SSE2-NEXT: pushl %esi
-; X32-SSE2-NEXT: pushl %eax
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-SSE2-NEXT: andl $31, %esi
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: andl $31, %eax
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-SSE2-NEXT: shldl $27, %ebx, %edi
+; X32-SSE2-NEXT: shll $27, %ebx
+; X32-SSE2-NEXT: shrdl $1, %edi, %ebx
+; X32-SSE2-NEXT: shrl %edi
; X32-SSE2-NEXT: pushl $0
; X32-SSE2-NEXT: pushl $37
-; X32-SSE2-NEXT: pushl %eax
+; X32-SSE2-NEXT: pushl {{[0-9]+}}(%esp)
; X32-SSE2-NEXT: pushl {{[0-9]+}}(%esp)
; X32-SSE2-NEXT: calll __umoddi3
; X32-SSE2-NEXT: addl $16, %esp
-; X32-SSE2-NEXT: movl %eax, %ebx
-; X32-SSE2-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X32-SSE2-NEXT: movl %ebp, %edx
-; X32-SSE2-NEXT: movl %ebx, %ecx
-; X32-SSE2-NEXT: shll %cl, %ebp
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: shldl %cl, %edx, %eax
-; X32-SSE2-NEXT: xorl %ecx, %ecx
-; X32-SSE2-NEXT: testb $32, %bl
-; X32-SSE2-NEXT: cmovnel %ebp, %eax
-; X32-SSE2-NEXT: cmovnel %ecx, %ebp
-; X32-SSE2-NEXT: xorl %edx, %edx
-; X32-SSE2-NEXT: movb $37, %cl
-; X32-SSE2-NEXT: subb %bl, %cl
-; X32-SSE2-NEXT: shrdl %cl, %esi, %edi
-; X32-SSE2-NEXT: shrl %cl, %esi
+; X32-SSE2-NEXT: movl %eax, %edx
+; X32-SSE2-NEXT: movl %edx, %ecx
+; X32-SSE2-NEXT: notb %cl
+; X32-SSE2-NEXT: shrdl %cl, %edi, %ebx
+; X32-SSE2-NEXT: shrl %cl, %edi
+; X32-SSE2-NEXT: xorl %eax, %eax
; X32-SSE2-NEXT: testb $32, %cl
-; X32-SSE2-NEXT: cmovnel %esi, %edi
-; X32-SSE2-NEXT: cmovnel %edx, %esi
-; X32-SSE2-NEXT: orl %eax, %esi
-; X32-SSE2-NEXT: orl %ebp, %edi
-; X32-SSE2-NEXT: orl %ebx, (%esp) # 4-byte Folded Spill
-; X32-SSE2-NEXT: cmovel {{[0-9]+}}(%esp), %edi
-; X32-SSE2-NEXT: cmovel {{[0-9]+}}(%esp), %esi
-; X32-SSE2-NEXT: movl %edi, %eax
+; X32-SSE2-NEXT: cmovnel %edi, %ebx
+; X32-SSE2-NEXT: cmovnel %eax, %edi
+; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT: movl %edx, %ecx
+; X32-SSE2-NEXT: shll %cl, %eax
+; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X32-SSE2-NEXT: shldl %cl, %ebp, %esi
+; X32-SSE2-NEXT: testb $32, %dl
+; X32-SSE2-NEXT: cmovnel %eax, %esi
+; X32-SSE2-NEXT: movl $0, %ecx
+; X32-SSE2-NEXT: cmovnel %ecx, %eax
+; X32-SSE2-NEXT: orl %ebx, %eax
+; X32-SSE2-NEXT: orl %edi, %esi
; X32-SSE2-NEXT: movl %esi, %edx
-; X32-SSE2-NEXT: addl $4, %esp
; X32-SSE2-NEXT: popl %esi
; X32-SSE2-NEXT: popl %edi
; X32-SSE2-NEXT: popl %ebx
@@ -93,28 +87,18 @@ define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) nounwind {
;
; X64-AVX2-LABEL: fshl_i37:
; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movq %rdx, %r8
-; X64-AVX2-NEXT: movabsq $137438953471, %rax # imm = 0x1FFFFFFFFF
-; X64-AVX2-NEXT: andq %rax, %rsi
-; X64-AVX2-NEXT: andq %rax, %r8
-; X64-AVX2-NEXT: movabsq $-2492803253203993461, %rcx # imm = 0xDD67C8A60DD67C8B
-; X64-AVX2-NEXT: movq %r8, %rax
-; X64-AVX2-NEXT: mulq %rcx
+; X64-AVX2-NEXT: movq %rdx, %rcx
+; X64-AVX2-NEXT: movabsq $-2492803253203993461, %rdx # imm = 0xDD67C8A60DD67C8B
+; X64-AVX2-NEXT: movq %rcx, %rax
+; X64-AVX2-NEXT: mulq %rdx
; X64-AVX2-NEXT: shrq $5, %rdx
-; X64-AVX2-NEXT: leaq (%rdx,%rdx,8), %rax
-; X64-AVX2-NEXT: leaq (%rdx,%rax,4), %rax
-; X64-AVX2-NEXT: subq %rax, %r8
+; X64-AVX2-NEXT: leal (%rdx,%rdx,8), %eax
+; X64-AVX2-NEXT: leal (%rdx,%rax,4), %eax
+; X64-AVX2-NEXT: subl %eax, %ecx
+; X64-AVX2-NEXT: shlq $27, %rsi
+; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-AVX2-NEXT: shldq %cl, %rsi, %rdi
; X64-AVX2-NEXT: movq %rdi, %rax
-; X64-AVX2-NEXT: movl %r8d, %ecx
-; X64-AVX2-NEXT: shlq %cl, %rax
-; X64-AVX2-NEXT: movl $37, %ecx
-; X64-AVX2-NEXT: subl %r8d, %ecx
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shrq %cl, %rsi
-; X64-AVX2-NEXT: orq %rax, %rsi
-; X64-AVX2-NEXT: testq %r8, %r8
-; X64-AVX2-NEXT: cmoveq %rdi, %rsi
-; X64-AVX2-NEXT: movq %rsi, %rax
; X64-AVX2-NEXT: retq
%f = call i37 @llvm.fshl.i37(i37 %x, i37 %y, i37 %z)
ret i37 %f
@@ -235,41 +219,41 @@ define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) nounwind {
; X32-SSE2-NEXT: pushl %ebx
; X32-SSE2-NEXT: pushl %edi
; X32-SSE2-NEXT: pushl %esi
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-SSE2-NEXT: andl $31, %esi
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: andl $31, %eax
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-SSE2-NEXT: shldl $1, %edi, %esi
+; X32-SSE2-NEXT: addl %edi, %edi
; X32-SSE2-NEXT: pushl $0
; X32-SSE2-NEXT: pushl $37
-; X32-SSE2-NEXT: pushl %eax
+; X32-SSE2-NEXT: pushl {{[0-9]+}}(%esp)
; X32-SSE2-NEXT: pushl {{[0-9]+}}(%esp)
; X32-SSE2-NEXT: calll __umoddi3
; X32-SSE2-NEXT: addl $16, %esp
-; X32-SSE2-NEXT: movl %eax, %ebx
-; X32-SSE2-NEXT: movb $37, %cl
-; X32-SSE2-NEXT: subb %bl, %cl
-; X32-SSE2-NEXT: movl %ebp, %eax
-; X32-SSE2-NEXT: shll %cl, %ebp
-; X32-SSE2-NEXT: shldl %cl, %eax, %edi
+; X32-SSE2-NEXT: addb $27, %al
+; X32-SSE2-NEXT: movl %eax, %edx
+; X32-SSE2-NEXT: notb %dl
+; X32-SSE2-NEXT: movl %edx, %ecx
+; X32-SSE2-NEXT: shldl %cl, %edi, %esi
+; X32-SSE2-NEXT: shldl $27, %ebp, %ebx
+; X32-SSE2-NEXT: shll $27, %ebp
+; X32-SSE2-NEXT: movl %eax, %ecx
+; X32-SSE2-NEXT: shrdl %cl, %ebx, %ebp
+; X32-SSE2-NEXT: shrl %cl, %ebx
+; X32-SSE2-NEXT: xorl %ecx, %ecx
+; X32-SSE2-NEXT: testb $32, %al
+; X32-SSE2-NEXT: cmovnel %ebx, %ebp
+; X32-SSE2-NEXT: cmovnel %ecx, %ebx
; X32-SSE2-NEXT: xorl %eax, %eax
-; X32-SSE2-NEXT: testb $32, %cl
-; X32-SSE2-NEXT: cmovnel %ebp, %edi
-; X32-SSE2-NEXT: cmovnel %eax, %ebp
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: movl %ebx, %ecx
-; X32-SSE2-NEXT: shrdl %cl, %esi, %eax
-; X32-SSE2-NEXT: shrl %cl, %esi
-; X32-SSE2-NEXT: testb $32, %bl
-; X32-SSE2-NEXT: cmovnel %esi, %eax
-; X32-SSE2-NEXT: movl $0, %ecx
-; X32-SSE2-NEXT: cmovnel %ecx, %esi
-; X32-SSE2-NEXT: orl %edi, %esi
-; X32-SSE2-NEXT: orl %ebp, %eax
-; X32-SSE2-NEXT: orl %ebx, %edx
-; X32-SSE2-NEXT: cmovel {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: cmovel {{[0-9]+}}(%esp), %esi
+; X32-SSE2-NEXT: movl %edx, %ecx
+; X32-SSE2-NEXT: shll %cl, %edi
+; X32-SSE2-NEXT: testb $32, %dl
+; X32-SSE2-NEXT: cmovnel %edi, %esi
+; X32-SSE2-NEXT: cmovnel %eax, %edi
+; X32-SSE2-NEXT: orl %ebp, %edi
+; X32-SSE2-NEXT: orl %ebx, %esi
+; X32-SSE2-NEXT: movl %edi, %eax
; X32-SSE2-NEXT: movl %esi, %edx
; X32-SSE2-NEXT: popl %esi
; X32-SSE2-NEXT: popl %edi
@@ -279,28 +263,19 @@ define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) nounwind {
;
; X64-AVX2-LABEL: fshr_i37:
; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movq %rdx, %r8
-; X64-AVX2-NEXT: movabsq $137438953471, %rax # imm = 0x1FFFFFFFFF
-; X64-AVX2-NEXT: movq %rsi, %r9
-; X64-AVX2-NEXT: andq %rax, %r9
-; X64-AVX2-NEXT: andq %rax, %r8
-; X64-AVX2-NEXT: movabsq $-2492803253203993461, %rcx # imm = 0xDD67C8A60DD67C8B
-; X64-AVX2-NEXT: movq %r8, %rax
-; X64-AVX2-NEXT: mulq %rcx
+; X64-AVX2-NEXT: movq %rdx, %rcx
+; X64-AVX2-NEXT: movabsq $-2492803253203993461, %rdx # imm = 0xDD67C8A60DD67C8B
+; X64-AVX2-NEXT: movq %rcx, %rax
+; X64-AVX2-NEXT: mulq %rdx
; X64-AVX2-NEXT: shrq $5, %rdx
-; X64-AVX2-NEXT: leaq (%rdx,%rdx,8), %rax
-; X64-AVX2-NEXT: leaq (%rdx,%rax,4), %rax
-; X64-AVX2-NEXT: subq %rax, %r8
-; X64-AVX2-NEXT: movl %r8d, %ecx
-; X64-AVX2-NEXT: shrq %cl, %r9
-; X64-AVX2-NEXT: movl $37, %ecx
-; X64-AVX2-NEXT: subl %r8d, %ecx
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shlq %cl, %rdi
-; X64-AVX2-NEXT: orq %r9, %rdi
-; X64-AVX2-NEXT: testq %r8, %r8
-; X64-AVX2-NEXT: cmoveq %rsi, %rdi
-; X64-AVX2-NEXT: movq %rdi, %rax
+; X64-AVX2-NEXT: leal (%rdx,%rdx,8), %eax
+; X64-AVX2-NEXT: leal (%rdx,%rax,4), %eax
+; X64-AVX2-NEXT: subl %eax, %ecx
+; X64-AVX2-NEXT: addl $27, %ecx
+; X64-AVX2-NEXT: shlq $27, %rsi
+; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-AVX2-NEXT: shrdq %cl, %rdi, %rsi
+; X64-AVX2-NEXT: movq %rsi, %rax
; X64-AVX2-NEXT: retq
%f = call i37 @llvm.fshr.i37(i37 %x, i37 %y, i37 %z)
ret i37 %f
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
index b8755a23e976..a98dfa8f11c1 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -27,154 +27,125 @@ declare <16 x i8> @llvm.fshl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,63]
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pandn %xmm3, %xmm4
+; SSE2-NEXT: psrlq $1, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlq %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; SSE2-NEXT: psrlq %xmm4, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
+; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psllq %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: psllq %xmm4, %xmm5
-; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [64,64]
-; SSE2-NEXT: psubq %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: psrlq %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; SSE2-NEXT: psrlq %xmm3, %xmm1
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
-; SSE2-NEXT: orpd %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; SSE2-NEXT: psllq %xmm2, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; SSE2-NEXT: orpd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v2i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE41-NEXT: psllq %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psllq %xmm5, %xmm4
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm4[4,5,6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [64,64]
-; SSE41-NEXT: psubq %xmm2, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [63,63]
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pandn %xmm3, %xmm4
+; SSE41-NEXT: psrlq $1, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: psrlq %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; SSE41-NEXT: psrlq %xmm0, %xmm1
+; SSE41-NEXT: psrlq %xmm4, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; SSE41-NEXT: psrlq %xmm4, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: por %xmm1, %xmm4
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pcmpeqq %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4
-; SSE41-NEXT: movapd %xmm4, %xmm0
+; SSE41-NEXT: pand %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psllq %xmm2, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; SSE41-NEXT: psllq %xmm2, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_funnnel_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
-; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm3
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX2-NEXT: vpsrlq $1, %xmm1, %xmm1
; AVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v2i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vpsllvq %xmm2, %xmm0, %xmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX512F-NEXT: vpsrlq $1, %xmm1, %xmm1
; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
-; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
-; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v2i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpsllvq %xmm2, %xmm0, %xmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX512VL-NEXT: vpsrlq $1, %xmm1, %xmm1
; AVX512VL-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512VL-NEXT: vptestnmq %xmm2, %xmm2, %k1
-; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
-; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v2i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vpsllvq %xmm2, %xmm0, %xmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX512BW-NEXT: vpsrlq $1, %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
-; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
-; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v2i64:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpsllvq %xmm2, %xmm0, %xmm3
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512VBMI2-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX512VBMI2-NEXT: vpsrlq $1, %xmm1, %xmm1
; AVX512VBMI2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
-; AVX512VBMI2-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
-; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v2i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpsllvq %xmm2, %xmm0, %xmm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX512VLBW-NEXT: vpsrlq $1, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512VLBW-NEXT: vptestnmq %xmm2, %xmm2, %k1
-; AVX512VLBW-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v2i64:
@@ -184,53 +155,46 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
;
; XOPAVX1-LABEL: var_funnnel_v2i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; XOPAVX1-NEXT: vpshlq %xmm2, %xmm0, %xmm3
-; XOPAVX1-NEXT: vpsubq {{.*}}(%rip), %xmm2, %xmm4
-; XOPAVX1-NEXT: vpshlq %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpshlq %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpshlq %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_funnnel_v2i64:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; XOPAVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm3
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; XOPAVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; XOPAVX2-NEXT: vpsrlq $1, %xmm1, %xmm1
; XOPAVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpor %xmm1, %xmm3, %xmm1
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: var_funnnel_v2i64:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0]
+; X32-SSE-NEXT: movdqa %xmm2, %xmm4
+; X32-SSE-NEXT: pandn %xmm3, %xmm4
+; X32-SSE-NEXT: psrlq $1, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm5
+; X32-SSE-NEXT: psrlq %xmm4, %xmm5
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; X32-SSE-NEXT: psrlq %xmm4, %xmm1
+; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
+; X32-SSE-NEXT: pand %xmm3, %xmm2
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
; X32-SSE-NEXT: psllq %xmm2, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm5
-; X32-SSE-NEXT: psllq %xmm4, %xmm5
-; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [64,0,64,0]
-; X32-SSE-NEXT: psubq %xmm2, %xmm3
-; X32-SSE-NEXT: movdqa %xmm1, %xmm4
-; X32-SSE-NEXT: psrlq %xmm3, %xmm4
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; X32-SSE-NEXT: psrlq %xmm3, %xmm1
-; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
-; X32-SSE-NEXT: orpd %xmm5, %xmm1
-; X32-SSE-NEXT: pxor %xmm3, %xmm3
-; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
-; X32-SSE-NEXT: pand %xmm3, %xmm2
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pandn %xmm1, %xmm2
-; X32-SSE-NEXT: por %xmm2, %xmm0
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; X32-SSE-NEXT: psllq %xmm2, %xmm0
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; X32-SSE-NEXT: orpd %xmm1, %xmm0
; X32-SSE-NEXT: retl
%res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
ret <2 x i64> %res
@@ -239,187 +203,157 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32,32,32,32]
-; SSE2-NEXT: psubd %xmm2, %xmm4
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: psrld %xmm3, %xmm5
-; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: psrld %xmm6, %xmm3
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pandn %xmm4, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: psrld $1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: psrld %xmm5, %xmm6
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: psrld %xmm4, %xmm1
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm6[1]
+; SSE2-NEXT: psrld %xmm3, %xmm6
+; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psrld %xmm7, %xmm3
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: psrld %xmm6, %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrld %xmm5, %xmm1
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: pslld $23, %xmm2
; SSE2-NEXT: paddd {{.*}}(%rip), %xmm2
-; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pmuludq %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn %xmm4, %xmm1
-; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: cvttps2dq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v4i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [32,32,32,32]
-; SSE41-NEXT: psubd %xmm2, %xmm0
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: psrld %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm7
-; SSE41-NEXT: psrld %xmm6, %xmm7
-; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm5[0,1,2,3],xmm7[4,5,6,7]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: psrld %xmm0, %xmm5
-; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,1,4,5,6,7]
-; SSE41-NEXT: psrld %xmm0, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5],xmm7[6,7]
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [31,31,31,31]
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pandn %xmm8, %xmm4
+; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: psrld $1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: psrld %xmm5, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: psrld %xmm7, %xmm3
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: psrld %xmm4, %xmm6
+; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: psrld %xmm4, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT: pand %xmm8, %xmm2
; SSE41-NEXT: pslld $23, %xmm2
; SSE41-NEXT: paddd {{.*}}(%rip), %xmm2
; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE41-NEXT: pmulld %xmm3, %xmm2
-; SSE41-NEXT: por %xmm1, %xmm2
-; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: pmulld %xmm2, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_funnnel_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32]
-; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm3
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm4
-; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31]
+; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
-; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm6
; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
-; AVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
+; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
-; AVX1-NEXT: vpslld $23, %xmm2, %xmm3
-; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm3, %xmm3
-; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
-; AVX1-NEXT: vpmulld %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
+; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm3
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
+; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX2-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v4i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpsllvd %xmm2, %xmm0, %xmm3
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
+; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX512F-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX512F-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
-; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
-; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v4i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
-; AVX512VL-NEXT: vpsllvd %xmm2, %xmm0, %xmm3
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX512VL-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512VL-NEXT: vptestnmd %xmm2, %xmm2, %k1
-; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v4i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsllvd %xmm2, %xmm0, %xmm3
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
+; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX512BW-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
-; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
-; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v4i32:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpsllvd %xmm2, %xmm0, %xmm3
-; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX512VBMI2-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX512VBMI2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
-; AVX512VBMI2-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
-; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v4i32:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpsllvd %xmm2, %xmm0, %xmm3
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX512VLBW-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512VLBW-NEXT: vptestnmd %xmm2, %xmm2, %k1
-; AVX512VLBW-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v4i32:
@@ -429,67 +363,61 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
;
; XOPAVX1-LABEL: var_funnnel_v4i32:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; XOPAVX1-NEXT: vpshld %xmm2, %xmm0, %xmm3
-; XOPAVX1-NEXT: vpsubd {{.*}}(%rip), %xmm2, %xmm4
-; XOPAVX1-NEXT: vpshld %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31]
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpsrld $1, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpshld %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_funnnel_v4i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm3
-; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; XOPAVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
+; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; XOPAVX2-NEXT: vpsrld $1, %xmm1, %xmm1
; XOPAVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpor %xmm1, %xmm3, %xmm1
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: var_funnnel_v4i32:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [32,32,32,32]
-; X32-SSE-NEXT: psubd %xmm2, %xmm4
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm1, %xmm5
-; X32-SSE-NEXT: psrld %xmm3, %xmm5
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm1, %xmm3
-; X32-SSE-NEXT: psrld %xmm6, %xmm3
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31]
+; X32-SSE-NEXT: movdqa %xmm2, %xmm5
+; X32-SSE-NEXT: pandn %xmm4, %xmm5
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT: psrld $1, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm6
-; X32-SSE-NEXT: psrld %xmm5, %xmm6
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: psrld %xmm4, %xmm1
-; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm6[1]
+; X32-SSE-NEXT: psrld %xmm3, %xmm6
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm3
+; X32-SSE-NEXT: psrld %xmm7, %xmm3
+; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm7
+; X32-SSE-NEXT: psrld %xmm6, %xmm7
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT: psrld %xmm5, %xmm1
+; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
; X32-SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
-; X32-SSE-NEXT: pxor %xmm1, %xmm1
-; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm1
+; X32-SSE-NEXT: pand %xmm4, %xmm2
; X32-SSE-NEXT: pslld $23, %xmm2
; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT: cvttps2dq %xmm2, %xmm2
-; X32-SSE-NEXT: movdqa %xmm0, %xmm4
-; X32-SSE-NEXT: pmuludq %xmm2, %xmm4
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; X32-SSE-NEXT: pmuludq %xmm5, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; X32-SSE-NEXT: por %xmm3, %xmm4
-; X32-SSE-NEXT: pand %xmm1, %xmm0
-; X32-SSE-NEXT: pandn %xmm4, %xmm1
-; X32-SSE-NEXT: por %xmm1, %xmm0
+; X32-SSE-NEXT: cvttps2dq %xmm2, %xmm1
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X32-SSE-NEXT: pmuludq %xmm2, %xmm1
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: retl
%res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
ret <4 x i32> %res
@@ -498,252 +426,233 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
-; SSE2-NEXT: psubw %xmm2, %xmm3
-; SSE2-NEXT: psllw $12, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: psraw $15, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm1, %xmm5
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: paddw %xmm3, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: psraw $15, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm1, %xmm5
-; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: paddw %xmm3, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: psraw $15, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm1, %xmm5
-; SSE2-NEXT: psrlw $2, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: paddw %xmm3, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm4
+; SSE2-NEXT: psllw $12, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm3
; SSE2-NEXT: psraw $15, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: psrlw $1, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm5
+; SSE2-NEXT: psrlw $9, %xmm1
; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpeqw %xmm2, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pslld $23, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
-; SSE2-NEXT: paddd %xmm6, %xmm2
-; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: pslld $23, %xmm3
-; SSE2-NEXT: paddd %xmm6, %xmm3
-; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; SSE2-NEXT: pmullw %xmm0, %xmm3
-; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: pandn %xmm5, %xmm3
; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: psraw $15, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pandn %xmm3, %xmm5
-; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: psrlw $4, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: por %xmm5, %xmm3
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: psraw $15, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: psrlw $2, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: por %xmm5, %xmm3
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: psraw $15, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm3
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $23, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $23, %xmm2
+; SSE2-NEXT: paddd %xmm4, %xmm2
+; SSE2-NEXT: cvttps2dq %xmm2, %xmm1
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm6[0]
+; SSE2-NEXT: pmullw %xmm0, %xmm1
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [16,16,16,16,16,16,16,16]
-; SSE41-NEXT: psubw %xmm2, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pandn %xmm5, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: psllw $12, %xmm4
; SSE41-NEXT: psllw $4, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: paddw %xmm0, %xmm4
-; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: psrlw $8, %xmm5
-; SSE41-NEXT: pblendvb %xmm0, %xmm5, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: psrlw $4, %xmm5
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: psrlw $9, %xmm6
+; SSE41-NEXT: psrlw $1, %xmm1
+; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: psrlw $4, %xmm6
; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm5, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: psrlw $2, %xmm5
+; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: psrlw $2, %xmm6
; SSE41-NEXT: paddw %xmm4, %xmm4
; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm5, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: psrlw $1, %xmm5
+; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: psrlw $1, %xmm6
; SSE41-NEXT: paddw %xmm4, %xmm4
; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm5, %xmm1
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pcmpeqw %xmm2, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
+; SSE41-NEXT: pand %xmm5, %xmm2
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; SSE41-NEXT: pslld $23, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
-; SSE41-NEXT: paddd %xmm5, %xmm2
-; SSE41-NEXT: cvttps2dq %xmm2, %xmm6
-; SSE41-NEXT: pslld $23, %xmm4
-; SSE41-NEXT: paddd %xmm5, %xmm4
-; SSE41-NEXT: cvttps2dq %xmm4, %xmm2
-; SSE41-NEXT: packusdw %xmm6, %xmm2
-; SSE41-NEXT: pmullw %xmm3, %xmm2
-; SSE41-NEXT: por %xmm1, %xmm2
-; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; SSE41-NEXT: paddd %xmm4, %xmm2
+; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
+; SSE41-NEXT: pslld $23, %xmm0
+; SSE41-NEXT: paddd %xmm4, %xmm0
+; SSE41-NEXT: cvttps2dq %xmm0, %xmm0
+; SSE41-NEXT: packusdw %xmm2, %xmm0
+; SSE41-NEXT: pmullw %xmm0, %xmm3
+; SSE41-NEXT: por %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_funnnel_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
-; AVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm3
-; AVX1-NEXT: vpsllw $12, %xmm3, %xmm4
-; AVX1-NEXT: vpsllw $4, %xmm3, %xmm3
-; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm4
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm5
-; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm3
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm3
-; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm3
-; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpsllw $12, %xmm4, %xmm5
+; AVX1-NEXT: vpsllw $4, %xmm4, %xmm4
+; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm5
+; AVX1-NEXT: vpsrlw $9, %xmm1, %xmm6
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4
+; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm4
+; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm4
+; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
-; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
-; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpmullw %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
+; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
-; AVX2-NEXT: vpsubw %xmm2, %xmm3, %xmm3
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpsrlvd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX2-NEXT: vpsllvd %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX2-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v8i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX512F-NEXT: vpsllvd %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %xmm2, %xmm4, %xmm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512F-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
-; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX512VL-NEXT: vpsllvd %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %xmm2, %xmm4, %xmm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX512VL-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512VL-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VL-NEXT: vpmovdw %ymm1, %xmm1
-; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v8i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX512BW-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v8i16:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm3
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512VBMI2-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX512VBMI2-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VBMI2-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpsllvw %xmm2, %xmm0, %xmm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512VLBW-NEXT: vptestnmw %xmm2, %xmm2, %k1
-; AVX512VLBW-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsllvw %xmm2, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v8i16:
@@ -753,76 +662,74 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
;
; XOP-LABEL: var_funnnel_v8i16:
; XOP: # %bb.0:
-; XOP-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; XOP-NEXT: vpshlw %xmm2, %xmm0, %xmm3
-; XOP-NEXT: vpsubw {{.*}}(%rip), %xmm2, %xmm4
-; XOP-NEXT: vpshlw %xmm4, %xmm1, %xmm1
-; XOP-NEXT: vpor %xmm1, %xmm3, %xmm1
+; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOP-NEXT: vpshlw %xmm4, %xmm0, %xmm0
+; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2
; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOP-NEXT: vpcomeqw %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; XOP-NEXT: vpsubw %xmm2, %xmm3, %xmm2
+; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
+; XOP-NEXT: vpshlw %xmm2, %xmm1, %xmm1
+; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_funnnel_v8i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
-; X32-SSE-NEXT: psubw %xmm2, %xmm3
-; X32-SSE-NEXT: psllw $12, %xmm3
-; X32-SSE-NEXT: movdqa %xmm3, %xmm4
-; X32-SSE-NEXT: psraw $15, %xmm4
-; X32-SSE-NEXT: movdqa %xmm4, %xmm5
-; X32-SSE-NEXT: pandn %xmm1, %xmm5
-; X32-SSE-NEXT: psrlw $8, %xmm1
-; X32-SSE-NEXT: pand %xmm4, %xmm1
-; X32-SSE-NEXT: por %xmm5, %xmm1
-; X32-SSE-NEXT: paddw %xmm3, %xmm3
-; X32-SSE-NEXT: movdqa %xmm3, %xmm4
-; X32-SSE-NEXT: psraw $15, %xmm4
-; X32-SSE-NEXT: movdqa %xmm4, %xmm5
-; X32-SSE-NEXT: pandn %xmm1, %xmm5
-; X32-SSE-NEXT: psrlw $4, %xmm1
-; X32-SSE-NEXT: pand %xmm4, %xmm1
-; X32-SSE-NEXT: por %xmm5, %xmm1
-; X32-SSE-NEXT: paddw %xmm3, %xmm3
-; X32-SSE-NEXT: movdqa %xmm3, %xmm4
-; X32-SSE-NEXT: psraw $15, %xmm4
-; X32-SSE-NEXT: movdqa %xmm4, %xmm5
-; X32-SSE-NEXT: pandn %xmm1, %xmm5
-; X32-SSE-NEXT: psrlw $2, %xmm1
-; X32-SSE-NEXT: pand %xmm4, %xmm1
-; X32-SSE-NEXT: por %xmm5, %xmm1
-; X32-SSE-NEXT: paddw %xmm3, %xmm3
+; X32-SSE-NEXT: pcmpeqd %xmm4, %xmm4
+; X32-SSE-NEXT: pxor %xmm2, %xmm4
+; X32-SSE-NEXT: psllw $12, %xmm4
+; X32-SSE-NEXT: movdqa %xmm4, %xmm3
; X32-SSE-NEXT: psraw $15, %xmm3
-; X32-SSE-NEXT: movdqa %xmm3, %xmm4
-; X32-SSE-NEXT: pandn %xmm1, %xmm4
-; X32-SSE-NEXT: psrlw $1, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm5
+; X32-SSE-NEXT: psrlw $1, %xmm5
+; X32-SSE-NEXT: psrlw $9, %xmm1
; X32-SSE-NEXT: pand %xmm3, %xmm1
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: pxor %xmm5, %xmm5
-; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm5
-; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pslld $23, %xmm2
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
-; X32-SSE-NEXT: paddd %xmm6, %xmm2
-; X32-SSE-NEXT: cvttps2dq %xmm2, %xmm2
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
-; X32-SSE-NEXT: pslld $23, %xmm3
-; X32-SSE-NEXT: paddd %xmm6, %xmm3
-; X32-SSE-NEXT: cvttps2dq %xmm3, %xmm3
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; X32-SSE-NEXT: pmullw %xmm0, %xmm3
-; X32-SSE-NEXT: por %xmm4, %xmm3
+; X32-SSE-NEXT: pandn %xmm5, %xmm3
; X32-SSE-NEXT: por %xmm1, %xmm3
-; X32-SSE-NEXT: pand %xmm5, %xmm0
+; X32-SSE-NEXT: paddw %xmm4, %xmm4
+; X32-SSE-NEXT: movdqa %xmm4, %xmm1
+; X32-SSE-NEXT: psraw $15, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm5
+; X32-SSE-NEXT: pandn %xmm3, %xmm5
+; X32-SSE-NEXT: psrlw $4, %xmm3
+; X32-SSE-NEXT: pand %xmm1, %xmm3
+; X32-SSE-NEXT: por %xmm5, %xmm3
+; X32-SSE-NEXT: paddw %xmm4, %xmm4
+; X32-SSE-NEXT: movdqa %xmm4, %xmm1
+; X32-SSE-NEXT: psraw $15, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm5
+; X32-SSE-NEXT: pandn %xmm3, %xmm5
+; X32-SSE-NEXT: psrlw $2, %xmm3
+; X32-SSE-NEXT: pand %xmm1, %xmm3
+; X32-SSE-NEXT: por %xmm5, %xmm3
+; X32-SSE-NEXT: paddw %xmm4, %xmm4
+; X32-SSE-NEXT: psraw $15, %xmm4
+; X32-SSE-NEXT: movdqa %xmm4, %xmm5
; X32-SSE-NEXT: pandn %xmm3, %xmm5
-; X32-SSE-NEXT: por %xmm5, %xmm0
+; X32-SSE-NEXT: psrlw $1, %xmm3
+; X32-SSE-NEXT: pand %xmm4, %xmm3
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm1
+; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pslld $23, %xmm1
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; X32-SSE-NEXT: paddd %xmm4, %xmm1
+; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3]
+; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
+; X32-SSE-NEXT: pslld $23, %xmm2
+; X32-SSE-NEXT: paddd %xmm4, %xmm2
+; X32-SSE-NEXT: cvttps2dq %xmm2, %xmm1
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm6[0]
+; X32-SSE-NEXT: pmullw %xmm0, %xmm1
+; X32-SSE-NEXT: por %xmm5, %xmm1
+; X32-SSE-NEXT: por %xmm3, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl
%res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
ret <8 x i16> %res
@@ -831,344 +738,337 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v16i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; SSE2-NEXT: pand %xmm6, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; SSE2-NEXT: movdqa %xmm2, %xmm7
+; SSE2-NEXT: pandn %xmm8, %xmm7
+; SSE2-NEXT: psllw $5, %xmm7
+; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psllw $4, %xmm3
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm6
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE2-NEXT: por %xmm6, %xmm3
-; SSE2-NEXT: paddb %xmm5, %xmm5
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm7
-; SSE2-NEXT: pandn %xmm3, %xmm7
-; SSE2-NEXT: psllw $2, %xmm3
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE2-NEXT: por %xmm7, %xmm3
-; SSE2-NEXT: paddb %xmm5, %xmm5
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm8
-; SSE2-NEXT: pandn %xmm3, %xmm8
-; SSE2-NEXT: paddb %xmm3, %xmm3
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; SSE2-NEXT: psubb %xmm2, %xmm6
-; SSE2-NEXT: psllw $5, %xmm6
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: pcmpgtb %xmm6, %xmm7
-; SSE2-NEXT: movdqa %xmm7, %xmm5
+; SSE2-NEXT: pcmpgtb %xmm7, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: pandn %xmm1, %xmm5
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: pand %xmm7, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: paddb %xmm6, %xmm6
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtb %xmm6, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm7
-; SSE2-NEXT: pandn %xmm1, %xmm7
-; SSE2-NEXT: psrlw $2, %xmm1
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: por %xmm7, %xmm1
-; SSE2-NEXT: paddb %xmm6, %xmm6
-; SSE2-NEXT: pcmpeqb %xmm4, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm6, %xmm4
+; SSE2-NEXT: paddb %xmm7, %xmm7
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtb %xmm7, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: pandn %xmm1, %xmm5
-; SSE2-NEXT: psrlw $1, %xmm1
+; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: por %xmm8, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: paddb %xmm7, %xmm7
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtb %xmm7, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm1, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm1
+; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: pand %xmm1, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm8, %xmm2
+; SSE2-NEXT: psllw $5, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psllw $4, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psllw $2, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v16i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; SSE41-NEXT: psubb %xmm2, %xmm4
-; SSE41-NEXT: pxor %xmm5, %xmm5
-; SSE41-NEXT: pcmpeqb %xmm2, %xmm5
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pandn %xmm4, %xmm0
; SSE41-NEXT: psllw $5, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm6
-; SSE41-NEXT: psllw $4, %xmm6
+; SSE41-NEXT: psrlw $1, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; SSE41-NEXT: pand %xmm5, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: psrlw $4, %xmm6
; SSE41-NEXT: pand {{.*}}(%rip), %xmm6
-; SSE41-NEXT: movdqa %xmm3, %xmm2
-; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm6
-; SSE41-NEXT: psllw $2, %xmm6
+; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: psrlw $2, %xmm6
; SSE41-NEXT: pand {{.*}}(%rip), %xmm6
; SSE41-NEXT: paddb %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm6
-; SSE41-NEXT: paddb %xmm2, %xmm6
+; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: psrlw $1, %xmm6
+; SSE41-NEXT: pand %xmm5, %xmm6
; SSE41-NEXT: paddb %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm2
-; SSE41-NEXT: psllw $5, %xmm4
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: paddb %xmm4, %xmm6
-; SSE41-NEXT: movdqa %xmm1, %xmm7
-; SSE41-NEXT: psrlw $4, %xmm7
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm7
+; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
+; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: psllw $5, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: paddb %xmm3, %xmm4
+; SSE41-NEXT: movdqa %xmm2, %xmm5
+; SSE41-NEXT: psllw $4, %xmm5
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm5
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm5, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psllw $2, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: paddb %xmm2, %xmm3
+; SSE41-NEXT: paddb %xmm4, %xmm4
; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: psrlw $2, %xmm4
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm4
-; SSE41-NEXT: movdqa %xmm6, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: psrlw $1, %xmm4
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm4
-; SSE41-NEXT: paddb %xmm6, %xmm6
-; SSE41-NEXT: movdqa %xmm6, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
-; SSE41-NEXT: por %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: por %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: var_funnnel_v16i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpsllw $5, %xmm2, %xmm3
-; AVX-NEXT: vpsllw $4, %xmm0, %xmm4
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4
-; AVX-NEXT: vpblendvb %xmm3, %xmm4, %xmm0, %xmm4
-; AVX-NEXT: vpsllw $2, %xmm4, %xmm5
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm5, %xmm5
-; AVX-NEXT: vpaddb %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
-; AVX-NEXT: vpaddb %xmm4, %xmm4, %xmm5
-; AVX-NEXT: vpaddb %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm3
-; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX-NEXT: vpsllw $5, %xmm4, %xmm4
-; AVX-NEXT: vpaddb %xmm4, %xmm4, %xmm5
+; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX-NEXT: vpand %xmm5, %xmm1, %xmm1
; AVX-NEXT: vpsrlw $4, %xmm1, %xmm6
; AVX-NEXT: vpand {{.*}}(%rip), %xmm6, %xmm6
; AVX-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
-; AVX-NEXT: vpsrlw $2, %xmm1, %xmm4
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4
-; AVX-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
-; AVX-NEXT: vpsrlw $1, %xmm1, %xmm4
+; AVX-NEXT: vpsrlw $2, %xmm1, %xmm6
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm6, %xmm6
+; AVX-NEXT: vpaddb %xmm4, %xmm4, %xmm4
+; AVX-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
+; AVX-NEXT: vpsrlw $1, %xmm1, %xmm6
+; AVX-NEXT: vpand %xmm5, %xmm6, %xmm5
+; AVX-NEXT: vpaddb %xmm4, %xmm4, %xmm4
+; AVX-NEXT: vpblendvb %xmm4, %xmm5, %xmm1, %xmm1
+; AVX-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpsllw $5, %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm2, %xmm2, %xmm3
+; AVX-NEXT: vpsllw $4, %xmm0, %xmm4
; AVX-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4
-; AVX-NEXT: vpaddb %xmm5, %xmm5, %xmm5
-; AVX-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
-; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
+; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vpaddb %xmm3, %xmm3, %xmm3
+; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v16i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
-; AVX512F-NEXT: vpsllvd %zmm4, %zmm3, %zmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
+; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
-; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1
-; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
-; AVX512VL-NEXT: vpsllvd %zmm4, %zmm3, %zmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
+; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512VL-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
-; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1
-; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1
+; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX512BW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512BW-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v16i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VBMI2-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
-; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VBMI2-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v16i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm3, %ymm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VLBW-NEXT: vpmovwb %ymm1, %xmm1
-; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VLBW-NEXT: vpsrlvw %ymm2, %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX512VLVBMI2-NEXT: vpsllvw %ymm4, %ymm3, %ymm3
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLVBMI2-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLVBMI2-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLVBMI2-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1
-; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VLVBMI2-NEXT: vpmovwb %ymm1, %xmm1
-; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1
-; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
-; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VLVBMI2-NEXT: vpsrlvw %ymm2, %ymm1, %ymm1
+; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
; AVX512VLVBMI2-NEXT: vzeroupper
; AVX512VLVBMI2-NEXT: retq
;
; XOP-LABEL: var_funnnel_v16i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; XOP-NEXT: vpshlb %xmm2, %xmm0, %xmm3
-; XOP-NEXT: vpsubb {{.*}}(%rip), %xmm2, %xmm4
+; XOP-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; XOP-NEXT: vpshlb %xmm3, %xmm1, %xmm1
+; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; XOP-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; XOP-NEXT: vpsubb %xmm4, %xmm5, %xmm4
; XOP-NEXT: vpshlb %xmm4, %xmm1, %xmm1
-; XOP-NEXT: vpor %xmm1, %xmm3, %xmm1
-; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOP-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; XOP-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOP-NEXT: vpshlb %xmm2, %xmm0, %xmm0
+; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_funnnel_v16i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm4
-; X32-SSE-NEXT: psllw $5, %xmm4
-; X32-SSE-NEXT: pxor %xmm5, %xmm5
-; X32-SSE-NEXT: pcmpgtb %xmm4, %xmm5
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psllw $4, %xmm3
-; X32-SSE-NEXT: pand %xmm5, %xmm3
-; X32-SSE-NEXT: pandn %xmm0, %xmm5
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm3
-; X32-SSE-NEXT: por %xmm5, %xmm3
-; X32-SSE-NEXT: paddb %xmm4, %xmm4
-; X32-SSE-NEXT: pxor %xmm5, %xmm5
-; X32-SSE-NEXT: pcmpgtb %xmm4, %xmm5
-; X32-SSE-NEXT: movdqa %xmm5, %xmm6
-; X32-SSE-NEXT: pandn %xmm3, %xmm6
-; X32-SSE-NEXT: psllw $2, %xmm3
-; X32-SSE-NEXT: pand %xmm5, %xmm3
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm3
-; X32-SSE-NEXT: por %xmm6, %xmm3
-; X32-SSE-NEXT: paddb %xmm4, %xmm4
-; X32-SSE-NEXT: pxor %xmm5, %xmm5
-; X32-SSE-NEXT: pcmpgtb %xmm4, %xmm5
-; X32-SSE-NEXT: movdqa %xmm5, %xmm4
-; X32-SSE-NEXT: pandn %xmm3, %xmm4
-; X32-SSE-NEXT: paddb %xmm3, %xmm3
-; X32-SSE-NEXT: pand %xmm5, %xmm3
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; X32-SSE-NEXT: psubb %xmm2, %xmm5
-; X32-SSE-NEXT: psllw $5, %xmm5
-; X32-SSE-NEXT: pxor %xmm6, %xmm6
-; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
-; X32-SSE-NEXT: movdqa %xmm6, %xmm7
+; X32-SSE-NEXT: psrlw $1, %xmm1
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X32-SSE-NEXT: pand %xmm5, %xmm1
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; X32-SSE-NEXT: movdqa %xmm2, %xmm6
+; X32-SSE-NEXT: pandn %xmm4, %xmm6
+; X32-SSE-NEXT: psllw $5, %xmm6
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pcmpgtb %xmm6, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm7
; X32-SSE-NEXT: pandn %xmm1, %xmm7
; X32-SSE-NEXT: psrlw $4, %xmm1
-; X32-SSE-NEXT: pand %xmm6, %xmm1
+; X32-SSE-NEXT: pand %xmm3, %xmm1
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: por %xmm7, %xmm1
-; X32-SSE-NEXT: paddb %xmm5, %xmm5
-; X32-SSE-NEXT: pxor %xmm6, %xmm6
-; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
-; X32-SSE-NEXT: movdqa %xmm6, %xmm7
+; X32-SSE-NEXT: paddb %xmm6, %xmm6
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pcmpgtb %xmm6, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm7
; X32-SSE-NEXT: pandn %xmm1, %xmm7
; X32-SSE-NEXT: psrlw $2, %xmm1
-; X32-SSE-NEXT: pand %xmm6, %xmm1
+; X32-SSE-NEXT: pand %xmm3, %xmm1
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: por %xmm7, %xmm1
-; X32-SSE-NEXT: pxor %xmm6, %xmm6
-; X32-SSE-NEXT: paddb %xmm5, %xmm5
-; X32-SSE-NEXT: pcmpeqb %xmm6, %xmm2
-; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
-; X32-SSE-NEXT: movdqa %xmm6, %xmm5
-; X32-SSE-NEXT: pandn %xmm1, %xmm5
+; X32-SSE-NEXT: paddb %xmm6, %xmm6
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pcmpgtb %xmm6, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm6
+; X32-SSE-NEXT: pand %xmm5, %xmm3
+; X32-SSE-NEXT: pandn %xmm1, %xmm6
; X32-SSE-NEXT: psrlw $1, %xmm1
-; X32-SSE-NEXT: pand %xmm6, %xmm1
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: por %xmm5, %xmm1
-; X32-SSE-NEXT: por %xmm4, %xmm1
-; X32-SSE-NEXT: por %xmm3, %xmm1
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pandn %xmm1, %xmm2
+; X32-SSE-NEXT: pand %xmm1, %xmm3
+; X32-SSE-NEXT: por %xmm6, %xmm3
+; X32-SSE-NEXT: pand %xmm4, %xmm2
+; X32-SSE-NEXT: psllw $5, %xmm2
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm4
+; X32-SSE-NEXT: pandn %xmm0, %xmm4
+; X32-SSE-NEXT: psllw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm1, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm4, %xmm0
+; X32-SSE-NEXT: paddb %xmm2, %xmm2
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm4
+; X32-SSE-NEXT: pandn %xmm0, %xmm4
+; X32-SSE-NEXT: psllw $2, %xmm0
+; X32-SSE-NEXT: pand %xmm1, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm4, %xmm0
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: paddb %xmm2, %xmm2
+; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: pandn %xmm0, %xmm2
+; X32-SSE-NEXT: por %xmm3, %xmm2
+; X32-SSE-NEXT: paddb %xmm0, %xmm0
+; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
%res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
@@ -1182,142 +1082,121 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
; SSE2-LABEL: splatvar_funnnel_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psllq %xmm2, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [64,64]
-; SSE2-NEXT: psubq %xmm2, %xmm4
-; SSE2-NEXT: psrlq %xmm4, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [63,63]
+; SSE2-NEXT: pandn %xmm4, %xmm3
+; SSE2-NEXT: psrlq $1, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlq %xmm3, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; SSE2-NEXT: psrlq %xmm3, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: psllq %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_funnnel_v2i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: psllq %xmm2, %xmm4
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [64,64]
-; SSE41-NEXT: psubq %xmm2, %xmm0
-; SSE41-NEXT: psrlq %xmm0, %xmm1
-; SSE41-NEXT: por %xmm1, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1]
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pcmpeqq %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4
-; SSE41-NEXT: movapd %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [63,63]
+; SSE41-NEXT: pandn %xmm4, %xmm3
+; SSE41-NEXT: psrlq $1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm5
+; SSE41-NEXT: psrlq %xmm3, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; SSE41-NEXT: psrlq %xmm3, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pand %xmm4, %xmm2
+; SSE41-NEXT: psllq %xmm2, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_funnnel_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [63,63]
+; AVX1-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlq %xmm3, %xmm1, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX1-NEXT: vpsrlq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_funnnel_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm3
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX2-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpbroadcastq %xmm2, %xmm3
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [63,63]
+; AVX2-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlq $1, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlvq %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v2i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vpsllq %xmm2, %xmm0, %xmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
-; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [63,63]
+; AVX512F-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512F-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX512F-NEXT: vpsrlq $1, %xmm1, %xmm1
+; AVX512F-NEXT: vpsrlvq %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v2i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpsllq %xmm2, %xmm0, %xmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512VL-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512VL-NEXT: vptestnmq %xmm2, %xmm2, %k1
-; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
-; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VL-NEXT: vpbroadcastq %xmm2, %xmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [63,63]
+; AVX512VL-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX512VL-NEXT: vpsrlq $1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpsrlvq %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v2i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vpsllq %xmm2, %xmm0, %xmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512BW-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
-; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: vpbroadcastq %xmm2, %xmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [63,63]
+; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX512BW-NEXT: vpsrlq $1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsrlvq %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v2i64:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpsllq %xmm2, %xmm0, %xmm3
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512VBMI2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; AVX512VBMI2-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
-; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %xmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [63,63]
+; AVX512VBMI2-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX512VBMI2-NEXT: vpsrlq $1, %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpsrlvq %xmm2, %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v2i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpsllq %xmm2, %xmm0, %xmm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512VLBW-NEXT: vptestnmq %xmm2, %xmm2, %k1
-; AVX512VLBW-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %xmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [63,63]
+; AVX512VLBW-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX512VLBW-NEXT: vpsrlq $1, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpsrlvq %xmm2, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v2i64:
@@ -1328,53 +1207,49 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
;
; XOPAVX1-LABEL: splatvar_funnnel_v2i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm3
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [63,63]
+; XOPAVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpandn %xmm4, %xmm3, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpshlq %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_funnnel_v2i64:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; XOPAVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm3
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; XOPAVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; XOPAVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpor %xmm1, %xmm3, %xmm1
-; XOPAVX2-NEXT: vpbroadcastq %xmm2, %xmm2
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX2-NEXT: vpbroadcastq %xmm2, %xmm3
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [63,63]
+; XOPAVX2-NEXT: vpand %xmm4, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; XOPAVX2-NEXT: vpsrlq $1, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpsrlvq %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: splatvar_funnnel_v2i64:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
-; X32-SSE-NEXT: pxor %xmm4, %xmm4
-; X32-SSE-NEXT: pcmpeqd %xmm3, %xmm4
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2]
-; X32-SSE-NEXT: pand %xmm4, %xmm5
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [64,0,64,0]
-; X32-SSE-NEXT: psubq %xmm3, %xmm4
-; X32-SSE-NEXT: movdqa %xmm1, %xmm3
-; X32-SSE-NEXT: psrlq %xmm4, %xmm3
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0]
+; X32-SSE-NEXT: movdqa %xmm2, %xmm4
+; X32-SSE-NEXT: pandn %xmm3, %xmm4
+; X32-SSE-NEXT: psrlq $1, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm5
+; X32-SSE-NEXT: psrlq %xmm4, %xmm5
; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; X32-SSE-NEXT: psrlq %xmm4, %xmm1
-; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
+; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
+; X32-SSE-NEXT: pand %xmm3, %xmm2
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
; X32-SSE-NEXT: psllq %xmm2, %xmm3
-; X32-SSE-NEXT: por %xmm1, %xmm3
-; X32-SSE-NEXT: pand %xmm5, %xmm0
-; X32-SSE-NEXT: pandn %xmm3, %xmm5
-; X32-SSE-NEXT: por %xmm5, %xmm0
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; X32-SSE-NEXT: psllq %xmm2, %xmm0
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; X32-SSE-NEXT: orpd %xmm1, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
%res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat)
@@ -1384,164 +1259,158 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
; SSE2-LABEL: splatvar_funnnel_v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: andps {{.*}}(%rip), %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: xorps %xmm4, %xmm4
-; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pslld %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
+; SSE2-NEXT: psrld $1, %xmm1
+; SSE2-NEXT: pandn {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrld %xmm4, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psrld %xmm4, %xmm6
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrld %xmm4, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrld %xmm3, %xmm1
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm5[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm1[0,3]
; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: movl $32, %ecx
-; SSE2-NEXT: subl %eax, %ecx
-; SSE2-NEXT: movd %ecx, %xmm4
-; SSE2-NEXT: psrld %xmm4, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: andl $31, %eax
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: pslld %xmm1, %xmm0
+; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_funnnel_v4i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pslld %xmm0, %xmm4
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [32,32,32,32]
-; SSE41-NEXT: psubd %xmm2, %xmm0
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT: psrld %xmm0, %xmm1
-; SSE41-NEXT: por %xmm1, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm4
-; SSE41-NEXT: movaps %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
+; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [31,31,31,31]
+; SSE41-NEXT: pandn %xmm8, %xmm3
+; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: psrld $1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: psrld %xmm5, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,2,3]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: psrld %xmm7, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: psrld %xmm3, %xmm6
+; SSE41-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: psrld %xmm3, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
+; SSE41-NEXT: pand %xmm8, %xmm2
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; SSE41-NEXT: pslld %xmm2, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_funnnel_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpslld %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32]
-; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [31,31,31,31]
+; AVX1-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
+; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5
+; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm6
+; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
+; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm6[2],xmm3[3],xmm6[3]
+; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_funnnel_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX2-NEXT: vpslld %xmm3, %xmm0, %xmm3
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX2-NEXT: vpsrld %xmm4, %xmm1, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpbroadcastd %xmm2, %xmm3
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31]
+; AVX2-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpsrld $1, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlvd %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX2-NEXT: vpslld %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v4i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX512F-NEXT: vpslld %xmm3, %xmm0, %xmm3
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512F-NEXT: vpsrld %xmm4, %xmm1, %xmm1
-; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512F-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
-; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: vpbroadcastd %xmm2, %xmm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31]
+; AVX512F-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX512F-NEXT: vpsrld $1, %xmm1, %xmm1
+; AVX512F-NEXT: vpsrlvd %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512F-NEXT: vpslld %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v4i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX512VL-NEXT: vpslld %xmm3, %xmm0, %xmm3
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512VL-NEXT: vpsrld %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512VL-NEXT: vptestnmd %xmm2, %xmm2, %k1
-; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31]
+; AVX512VL-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpsrlvd %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512VL-NEXT: vpslld %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v4i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX512BW-NEXT: vpslld %xmm3, %xmm0, %xmm3
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512BW-NEXT: vpsrld %xmm4, %xmm1, %xmm1
-; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512BW-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
-; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: vpbroadcastd %xmm2, %xmm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31]
+; AVX512BW-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT: vpsrld $1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsrlvd %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512BW-NEXT: vpslld %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v4i32:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX512VBMI2-NEXT: vpslld %xmm3, %xmm0, %xmm3
-; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512VBMI2-NEXT: vpsrld %xmm4, %xmm1, %xmm1
-; AVX512VBMI2-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
-; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm3
+; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31]
+; AVX512VBMI2-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX512VBMI2-NEXT: vpsrld $1, %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpsrlvd %xmm3, %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512VBMI2-NEXT: vpslld %xmm2, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v4i32:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX512VLBW-NEXT: vpslld %xmm3, %xmm0, %xmm3
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512VLBW-NEXT: vpsrld %xmm4, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512VLBW-NEXT: vptestnmd %xmm2, %xmm2, %k1
-; AVX512VLBW-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31]
+; AVX512VLBW-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX512VLBW-NEXT: vpsrld $1, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpsrlvd %xmm3, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512VLBW-NEXT: vpslld %xmm2, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i32:
@@ -1552,56 +1421,57 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
;
; XOPAVX1-LABEL: splatvar_funnnel_v4i32:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; XOPAVX1-NEXT: vpslld %xmm3, %xmm0, %xmm3
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32]
-; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; XOPAVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [31,31,31,31]
+; XOPAVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; XOPAVX1-NEXT: vpslld %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpandn %xmm4, %xmm3, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpsrld $1, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpshld %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_funnnel_v4i32:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; XOPAVX2-NEXT: vpslld %xmm3, %xmm0, %xmm3
-; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; XOPAVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; XOPAVX2-NEXT: vpsrld %xmm4, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpor %xmm1, %xmm3, %xmm1
-; XOPAVX2-NEXT: vpbroadcastd %xmm2, %xmm2
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX2-NEXT: vpbroadcastd %xmm2, %xmm3
+; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31]
+; XOPAVX2-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; XOPAVX2-NEXT: vpsrld $1, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpsrlvd %xmm3, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpand %xmm4, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; XOPAVX2-NEXT: vpslld %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: splatvar_funnnel_v4i32:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT: pxor %xmm3, %xmm3
-; X32-SSE-NEXT: xorps %xmm4, %xmm4
-; X32-SSE-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm5
-; X32-SSE-NEXT: pslld %xmm4, %xmm5
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
+; X32-SSE-NEXT: psrld $1, %xmm1
+; X32-SSE-NEXT: pandn {{\.LCPI.*}}, %xmm3
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm5
+; X32-SSE-NEXT: psrld %xmm4, %xmm5
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm6
+; X32-SSE-NEXT: psrld %xmm4, %xmm6
+; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm5
+; X32-SSE-NEXT: psrld %xmm4, %xmm5
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT: psrld %xmm3, %xmm1
+; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm5[1]
+; X32-SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm1[0,3]
; X32-SSE-NEXT: movd %xmm2, %eax
-; X32-SSE-NEXT: movl $32, %ecx
-; X32-SSE-NEXT: subl %eax, %ecx
-; X32-SSE-NEXT: movd %ecx, %xmm4
-; X32-SSE-NEXT: psrld %xmm4, %xmm1
-; X32-SSE-NEXT: por %xmm5, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; X32-SSE-NEXT: pcmpeqd %xmm3, %xmm2
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pandn %xmm1, %xmm2
-; X32-SSE-NEXT: por %xmm2, %xmm0
+; X32-SSE-NEXT: andl $31, %eax
+; X32-SSE-NEXT: movd %eax, %xmm1
+; X32-SSE-NEXT: pslld %xmm1, %xmm0
+; X32-SSE-NEXT: por %xmm6, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <4 x i32> %amt, <4 x i32> undef, <4 x i32> zeroinitializer
%res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %splat)
@@ -1611,161 +1481,205 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
; SSE2-LABEL: splatvar_funnnel_v8i16:
; SSE2: # %bb.0:
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,0,0]
+; SSE2-NEXT: psllw $12, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm3
+; SSE2-NEXT: psraw $15, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm5
+; SSE2-NEXT: psrlw $9, %xmm1
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm3
+; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: psraw $15, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: psrlw $4, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: por %xmm5, %xmm3
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: psraw $15, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: psrlw $2, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: por %xmm5, %xmm3
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: psraw $15, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm3
+; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
-; SSE2-NEXT: psubw %xmm2, %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: psllw %xmm2, %xmm5
-; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
-; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: psrlw %xmm3, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,0,0]
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpeqw %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm3
+; SSE2-NEXT: psllw %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_funnnel_v8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psllw %xmm0, %xmm4
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [16,16,16,16,16,16,16,16]
-; SSE41-NEXT: psubw %xmm2, %xmm0
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE41-NEXT: psrlw %xmm0, %xmm1
-; SSE41-NEXT: por %xmm1, %xmm4
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pcmpeqw %xmm1, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15]
+; SSE41-NEXT: pandn %xmm5, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: psllw $12, %xmm4
+; SSE41-NEXT: psllw $4, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm4
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: psrlw $9, %xmm6
+; SSE41-NEXT: psrlw $1, %xmm1
+; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: psrlw $4, %xmm6
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: psrlw $2, %xmm6
+; SSE41-NEXT: paddw %xmm4, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: psrlw $1, %xmm6
+; SSE41-NEXT: paddw %xmm4, %xmm4
; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
+; SSE41-NEXT: pand %xmm5, %xmm2
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; SSE41-NEXT: psllw %xmm0, %xmm3
+; SSE41-NEXT: por %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_funnnel_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsllw $12, %xmm3, %xmm5
+; AVX1-NEXT: vpsllw $4, %xmm3, %xmm3
+; AVX1-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm5
+; AVX1-NEXT: vpsrlw $9, %xmm1, %xmm6
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm3
+; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm3
+; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm3
+; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_funnnel_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX2-NEXT: vpsllw %xmm3, %xmm0, %xmm3
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX2-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpbroadcastw %xmm2, %xmm3
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpsrlvd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX2-NEXT: vpsllw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v8i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm3, %xmm0, %xmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
-; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512F-NEXT: vpsllw %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512F-NEXT: vpsrlvd %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm3, %xmm0, %xmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VL-NEXT: vpsllw %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX512VL-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512VL-NEXT: vpsrlvd %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpmovdw %ymm1, %xmm1
+; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v8i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512BW-NEXT: vpsllw %xmm3, %xmm0, %xmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
-; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm3
+; AVX512BW-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512BW-NEXT: vpsllw %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v8i16:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VBMI2-NEXT: vpsllw %xmm3, %xmm0, %xmm3
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VBMI2-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
-; AVX512VBMI2-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %xmm3
+; AVX512VBMI2-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
+; AVX512VBMI2-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX512VBMI2-NEXT: vpsrlvw %zmm3, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VBMI2-NEXT: vpsllw %xmm2, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VLBW-NEXT: vpsllw %xmm3, %xmm0, %xmm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512VLBW-NEXT: vptestnmw %xmm2, %xmm2, %k1
-; AVX512VLBW-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpsrlvw %xmm3, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i16:
@@ -1776,56 +1690,77 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; XOPAVX1-LABEL: splatvar_funnnel_v8i16:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; XOPAVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm3
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
-; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,0,0,4,5,6,7]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
+; XOPAVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; XOPAVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpandn %xmm4, %xmm3, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpcomeqw %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpshlw %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_funnnel_v8i16:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; XOPAVX2-NEXT: vpsllw %xmm3, %xmm0, %xmm3
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; XOPAVX2-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpor %xmm1, %xmm3, %xmm1
-; XOPAVX2-NEXT: vpbroadcastw %xmm2, %xmm2
+; XOPAVX2-NEXT: vpbroadcastw %xmm2, %xmm3
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
+; XOPAVX2-NEXT: vpand %xmm4, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; XOPAVX2-NEXT: vpsllw %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpandn %xmm4, %xmm3, %xmm2
; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpcomeqw %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX2-NEXT: vpsubw %xmm2, %xmm3, %xmm2
+; XOPAVX2-NEXT: vpsrlw $1, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpshlw %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: splatvar_funnnel_v8i16:
; X32-SSE: # %bb.0:
+; X32-SSE-NEXT: pcmpeqd %xmm3, %xmm3
+; X32-SSE-NEXT: pxor %xmm2, %xmm3
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,0,0]
+; X32-SSE-NEXT: psllw $12, %xmm4
+; X32-SSE-NEXT: movdqa %xmm4, %xmm3
+; X32-SSE-NEXT: psraw $15, %xmm3
+; X32-SSE-NEXT: movdqa %xmm1, %xmm5
+; X32-SSE-NEXT: psrlw $1, %xmm5
+; X32-SSE-NEXT: psrlw $9, %xmm1
+; X32-SSE-NEXT: pand %xmm3, %xmm1
+; X32-SSE-NEXT: pandn %xmm5, %xmm3
+; X32-SSE-NEXT: por %xmm1, %xmm3
+; X32-SSE-NEXT: paddw %xmm4, %xmm4
+; X32-SSE-NEXT: movdqa %xmm4, %xmm1
+; X32-SSE-NEXT: psraw $15, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm5
+; X32-SSE-NEXT: pandn %xmm3, %xmm5
+; X32-SSE-NEXT: psrlw $4, %xmm3
+; X32-SSE-NEXT: pand %xmm1, %xmm3
+; X32-SSE-NEXT: por %xmm5, %xmm3
+; X32-SSE-NEXT: paddw %xmm4, %xmm4
+; X32-SSE-NEXT: movdqa %xmm4, %xmm1
+; X32-SSE-NEXT: psraw $15, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm5
+; X32-SSE-NEXT: pandn %xmm3, %xmm5
+; X32-SSE-NEXT: psrlw $2, %xmm3
+; X32-SSE-NEXT: pand %xmm1, %xmm3
+; X32-SSE-NEXT: por %xmm5, %xmm3
+; X32-SSE-NEXT: paddw %xmm4, %xmm4
+; X32-SSE-NEXT: psraw $15, %xmm4
+; X32-SSE-NEXT: movdqa %xmm4, %xmm1
+; X32-SSE-NEXT: pandn %xmm3, %xmm1
+; X32-SSE-NEXT: psrlw $1, %xmm3
+; X32-SSE-NEXT: pand %xmm4, %xmm3
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
-; X32-SSE-NEXT: psubw %xmm2, %xmm3
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,0,0,0,4,5,6,7]
; X32-SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT: movdqa %xmm0, %xmm5
-; X32-SSE-NEXT: psllw %xmm2, %xmm5
-; X32-SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
-; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT: psrlw %xmm3, %xmm1
-; X32-SSE-NEXT: por %xmm5, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,0,0]
-; X32-SSE-NEXT: pxor %xmm3, %xmm3
-; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm3
-; X32-SSE-NEXT: pand %xmm3, %xmm0
-; X32-SSE-NEXT: pandn %xmm1, %xmm3
+; X32-SSE-NEXT: psllw %xmm2, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> zeroinitializer
@@ -1836,298 +1771,350 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
; SSE2-LABEL: splatvar_funnnel_v16i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
-; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: psllw %xmm3, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE2-NEXT: psllw %xmm3, %xmm6
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; SSE2-NEXT: psubb %xmm2, %xmm4
-; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
-; SSE2-NEXT: psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: psrlw %xmm4, %xmm1
-; SSE2-NEXT: psrlw %xmm4, %xmm5
-; SSE2-NEXT: psrlw $8, %xmm5
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
-; SSE2-NEXT: pand %xmm1, %xmm4
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,0,0]
+; SSE2-NEXT: psrlw $1, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; SSE2-NEXT: pandn %xmm8, %xmm6
+; SSE2-NEXT: psllw $5, %xmm6
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: pcmpgtb %xmm6, %xmm7
+; SSE2-NEXT: movdqa %xmm7, %xmm4
+; SSE2-NEXT: pandn %xmm1, %xmm4
+; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: pand %xmm7, %xmm1
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: paddb %xmm6, %xmm6
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtb %xmm6, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm7
+; SSE2-NEXT: pandn %xmm1, %xmm7
+; SSE2-NEXT: psrlw $2, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: por %xmm7, %xmm1
+; SSE2-NEXT: paddb %xmm6, %xmm6
+; SSE2-NEXT: pcmpgtb %xmm6, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm1, %xmm4
+; SSE2-NEXT: psrlw $1, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: pand %xmm8, %xmm2
+; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psllw %xmm2, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: psllw %xmm2, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpeqb %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm4, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_funnnel_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT: pxor %xmm4, %xmm4
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pshufb %xmm4, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; SSE41-NEXT: pandn %xmm5, %xmm0
+; SSE41-NEXT: psllw $5, %xmm0
+; SSE41-NEXT: psrlw $1, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; SSE41-NEXT: pand %xmm6, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm7
+; SSE41-NEXT: psrlw $4, %xmm7
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm7
+; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm7
+; SSE41-NEXT: psrlw $2, %xmm7
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm7
+; SSE41-NEXT: paddb %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm7
+; SSE41-NEXT: psrlw $1, %xmm7
+; SSE41-NEXT: pand %xmm6, %xmm7
+; SSE41-NEXT: paddb %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm1
+; SSE41-NEXT: pand %xmm5, %xmm2
; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psllw %xmm0, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE41-NEXT: psllw %xmm0, %xmm6
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pshufb %xmm0, %xmm6
-; SSE41-NEXT: pand %xmm6, %xmm4
-; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; SSE41-NEXT: psubb %xmm2, %xmm6
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: psrlw %xmm6, %xmm1
-; SSE41-NEXT: psrlw %xmm6, %xmm5
-; SSE41-NEXT: pshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SSE41-NEXT: pand %xmm1, %xmm5
-; SSE41-NEXT: por %xmm5, %xmm4
-; SSE41-NEXT: pshufb %xmm0, %xmm2
-; SSE41-NEXT: pcmpeqb %xmm2, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: psllw %xmm0, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: psllw %xmm0, %xmm2
+; SSE41-NEXT: pshufb %xmm4, %xmm2
+; SSE41-NEXT: pand %xmm2, %xmm3
+; SSE41-NEXT: por %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_funnnel_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpsllw %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX1-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX1-NEXT: vpandn %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm7
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm7, %xmm7
+; AVX1-NEXT: vpblendvb %xmm4, %xmm7, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm7
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm7, %xmm7
+; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpblendvb %xmm4, %xmm7, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm7
+; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm6
+; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpsllw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_funnnel_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpsllw %xmm3, %xmm0, %xmm4
-; AVX2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX2-NEXT: vpsllw %xmm3, %xmm5, %xmm3
-; AVX2-NEXT: vpbroadcastb %xmm3, %xmm3
-; AVX2-NEXT: vpand %xmm3, %xmm4, %xmm3
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
-; AVX2-NEXT: vpsrlw %xmm4, %xmm5, %xmm4
-; AVX2-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX2-NEXT: vpbroadcastb %xmm4, %xmm4
-; AVX2-NEXT: vpand %xmm4, %xmm1, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX2-NEXT: vpbroadcastb %xmm2, %xmm3
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX2-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpsllw $5, %xmm3, %xmm3
+; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT: vpand %xmm5, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm6
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm6, %xmm6
+; AVX2-NEXT: vpblendvb %xmm3, %xmm6, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlw $2, %xmm1, %xmm6
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm6, %xmm6
+; AVX2-NEXT: vpaddb %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vpblendvb %xmm3, %xmm6, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm6
+; AVX2-NEXT: vpand %xmm5, %xmm6, %xmm5
+; AVX2-NEXT: vpaddb %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vpblendvb %xmm3, %xmm5, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpsllw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vpsllw %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v16i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpslld %xmm4, %zmm3, %zmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512F-NEXT: vpslld %xmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512F-NEXT: vpsrld %xmm4, %zmm1, %zmm1
-; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1
-; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpslld %xmm4, %zmm3, %zmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT: vpslld %xmm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512VL-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512VL-NEXT: vpsrld %xmm4, %zmm1, %zmm1
-; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1
-; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1
+; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpsllw %xmm4, %ymm3, %ymm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX512BW-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsllw %xmm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VBMI2-NEXT: vpsllw %xmm4, %ymm3, %ymm3
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX512VBMI2-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VBMI2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VBMI2-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
-; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VBMI2-NEXT: vpsrlvw %zmm3, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VBMI2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpsllw %xmm4, %ymm3, %ymm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLBW-NEXT: vpsllw %xmm2, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VLBW-NEXT: vpmovwb %ymm1, %xmm1
-; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VLBW-NEXT: vpsrlvw %ymm2, %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %ymm3, %ymm3
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLVBMI2-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VLVBMI2-NEXT: vpmovwb %ymm1, %xmm1
-; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1
-; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
-; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VLVBMI2-NEXT: vpsrlvw %ymm2, %ymm1, %ymm1
+; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
; AVX512VLVBMI2-NEXT: vzeroupper
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_funnnel_v16i8:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm4
-; XOPAVX1-NEXT: vpsubb {{.*}}(%rip), %xmm2, %xmm5
-; XOPAVX1-NEXT: vpshlb %xmm5, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
-; XOPAVX1-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX1-NEXT: vpandn %xmm4, %xmm2, %xmm5
+; XOPAVX1-NEXT: vpsubb %xmm5, %xmm3, %xmm3
+; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_funnnel_v16i8:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX2-NEXT: vpbroadcastb %xmm2, %xmm2
-; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm3
-; XOPAVX2-NEXT: vpsubb {{.*}}(%rip), %xmm2, %xmm4
+; XOPAVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT: vpshlb %xmm3, %xmm1, %xmm1
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX2-NEXT: vpsubb %xmm4, %xmm5, %xmm4
; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpor %xmm1, %xmm3, %xmm1
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: splatvar_funnnel_v16i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
-; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT: movdqa %xmm0, %xmm4
-; X32-SSE-NEXT: psllw %xmm3, %xmm4
-; X32-SSE-NEXT: pcmpeqd %xmm5, %xmm5
-; X32-SSE-NEXT: pcmpeqd %xmm6, %xmm6
-; X32-SSE-NEXT: psllw %xmm3, %xmm6
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,0,0,0,4,5,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
-; X32-SSE-NEXT: pand %xmm4, %xmm3
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; X32-SSE-NEXT: psubb %xmm2, %xmm4
-; X32-SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
-; X32-SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT: psrlw %xmm4, %xmm1
-; X32-SSE-NEXT: psrlw %xmm4, %xmm5
-; X32-SSE-NEXT: psrlw $8, %xmm5
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,0,0,0,4,5,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
-; X32-SSE-NEXT: pand %xmm1, %xmm4
-; X32-SSE-NEXT: por %xmm3, %xmm4
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,0,0]
+; X32-SSE-NEXT: psrlw $1, %xmm1
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X32-SSE-NEXT: pand %xmm4, %xmm1
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; X32-SSE-NEXT: pandn %xmm3, %xmm5
+; X32-SSE-NEXT: psllw $5, %xmm5
+; X32-SSE-NEXT: pxor %xmm6, %xmm6
+; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
+; X32-SSE-NEXT: movdqa %xmm6, %xmm7
+; X32-SSE-NEXT: pandn %xmm1, %xmm7
+; X32-SSE-NEXT: psrlw $4, %xmm1
+; X32-SSE-NEXT: pand %xmm6, %xmm1
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT: por %xmm7, %xmm1
+; X32-SSE-NEXT: paddb %xmm5, %xmm5
+; X32-SSE-NEXT: pxor %xmm6, %xmm6
+; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
+; X32-SSE-NEXT: movdqa %xmm6, %xmm7
+; X32-SSE-NEXT: pandn %xmm1, %xmm7
+; X32-SSE-NEXT: psrlw $2, %xmm1
+; X32-SSE-NEXT: pand %xmm6, %xmm1
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT: por %xmm7, %xmm1
+; X32-SSE-NEXT: pxor %xmm6, %xmm6
+; X32-SSE-NEXT: paddb %xmm5, %xmm5
+; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
+; X32-SSE-NEXT: movdqa %xmm6, %xmm5
+; X32-SSE-NEXT: pand %xmm4, %xmm6
+; X32-SSE-NEXT: pandn %xmm1, %xmm5
+; X32-SSE-NEXT: psrlw $1, %xmm1
+; X32-SSE-NEXT: pand %xmm1, %xmm6
+; X32-SSE-NEXT: pand %xmm3, %xmm2
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psllw %xmm2, %xmm0
+; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; X32-SSE-NEXT: psllw %xmm2, %xmm1
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X32-SSE-NEXT: pxor %xmm2, %xmm2
-; X32-SSE-NEXT: pcmpeqb %xmm1, %xmm2
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pandn %xmm4, %xmm2
-; X32-SSE-NEXT: por %xmm2, %xmm0
+; X32-SSE-NEXT: pand %xmm0, %xmm1
+; X32-SSE-NEXT: por %xmm5, %xmm1
+; X32-SSE-NEXT: por %xmm6, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
%res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %splat)
@@ -2514,13 +2501,21 @@ define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
;
; X32-SSE-LABEL: constant_funnnel_v2i64:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: psrlq $60, %xmm2
-; X32-SSE-NEXT: psrlq $50, %xmm1
-; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [63,0,63,0]
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = <4,u,14,u>
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pandn %xmm2, %xmm4
+; X32-SSE-NEXT: psrlq $1, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm5
+; X32-SSE-NEXT: psrlq %xmm4, %xmm5
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; X32-SSE-NEXT: psrlq %xmm4, %xmm1
+; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
+; X32-SSE-NEXT: pand %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psllq $4, %xmm2
-; X32-SSE-NEXT: psllq $14, %xmm0
+; X32-SSE-NEXT: psllq %xmm3, %xmm2
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; X32-SSE-NEXT: psllq %xmm3, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X32-SSE-NEXT: orpd %xmm1, %xmm0
; X32-SSE-NEXT: retl
@@ -2669,87 +2664,68 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
}
define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
-; SSE2-LABEL: constant_funnnel_v8i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = <u,2,4,8,16,32,64,128>
-; SSE2-NEXT: pmulhuw %xmm2, %xmm1
-; SSE2-NEXT: pmullw %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: constant_funnnel_v8i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <u,2,4,8,16,32,64,128>
-; SSE41-NEXT: pmulhuw %xmm2, %xmm1
-; SSE41-NEXT: pmullw %xmm0, %xmm2
-; SSE41-NEXT: por %xmm1, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
-; SSE41-NEXT: retq
+; SSE-LABEL: constant_funnnel_v8i16:
+; SSE: # %bb.0:
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT: psrlw $1, %xmm1
+; SSE-NEXT: pmulhuw {{.*}}(%rip), %xmm1
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: constant_funnnel_v8i16:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <u,2,4,8,16,32,64,128>
-; AVX-NEXT: vpmulhuw %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpmullw %xmm2, %xmm0, %xmm2
-; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: constant_funnnel_v8i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <u,2,4,8,16,32,64,128>
-; AVX512F-NEXT: vpmulhuw %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpmullw %xmm2, %xmm0, %xmm2
-; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512F-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX512F-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <u,2,4,8,16,32,64,128>
-; AVX512VL-NEXT: vpmulhuw %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpmullw %xmm2, %xmm0, %xmm2
-; AVX512VL-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v8i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,15,14,13,12,11,10,9>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8]
+; AVX512BW-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,1,2,3,4,5,6,7>
-; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2
-; AVX512BW-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: constant_funnnel_v8i16:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = <u,15,14,13,12,11,10,9>
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
+; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8]
+; AVX512VBMI2-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = <u,1,2,3,4,5,6,7>
-; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm2
-; AVX512VBMI2-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX512VBMI2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: constant_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm2
-; AVX512VLBW-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX512VLBW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v8i16:
@@ -2759,23 +2735,18 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
;
; XOP-LABEL: constant_funnnel_v8i16:
; XOP: # %bb.0:
+; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm1, %xmm1
-; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm2
-; XOP-NEXT: vpor %xmm1, %xmm2, %xmm1
-; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: constant_funnnel_v8i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = <u,2,4,8,16,32,64,128>
-; X32-SSE-NEXT: pmulhuw %xmm2, %xmm1
-; X32-SSE-NEXT: pmullw %xmm0, %xmm2
-; X32-SSE-NEXT: por %xmm1, %xmm2
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; X32-SSE-NEXT: pand %xmm1, %xmm2
-; X32-SSE-NEXT: pandn %xmm0, %xmm1
-; X32-SSE-NEXT: por %xmm2, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm0
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: psrlw $1, %xmm1
+; X32-SSE-NEXT: pmulhuw {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
%res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
ret <8 x i16> %res
@@ -2784,229 +2755,209 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; SSE2-LABEL: constant_funnnel_v16i8:
; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: psrlw $1, %xmm1
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = <u,128,64,32,16,8,4,2>
-; SSE2-NEXT: pmullw %xmm4, %xmm3
+; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm3
; SSE2-NEXT: psrlw $8, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = <u,2,4,8,16,32,64,128>
-; SSE2-NEXT: pmullw %xmm5, %xmm1
+; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: packuswb %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pmullw %xmm4, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw %xmm5, %xmm2
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: packuswb %xmm3, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_funnnel_v16i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = <u,128,64,32,16,8,4,2>
-; SSE41-NEXT: pmullw %xmm0, %xmm1
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm2
+; SSE41-NEXT: pand %xmm3, %xmm2
+; SSE41-NEXT: packuswb %xmm0, %xmm2
+; SSE41-NEXT: psrlw $1, %xmm1
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT: pxor %xmm3, %xmm3
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
+; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1
; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = <u,2,4,8,16,32,64,128>
-; SSE41-NEXT: pmullw %xmm4, %xmm3
-; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: packuswb %xmm1, %xmm3
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT: pmullw %xmm0, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; SSE41-NEXT: pmullw %xmm4, %xmm5
-; SSE41-NEXT: pand %xmm0, %xmm5
-; SSE41-NEXT: packuswb %xmm1, %xmm5
-; SSE41-NEXT: por %xmm3, %xmm5
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; SSE41-NEXT: pblendvb %xmm0, %xmm5, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_funnnel_v16i8:
; AVX1: # %bb.0:
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <u,128,64,32,16,8,4,2>
-; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <u,2,4,8,16,32,64,128>
-; AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_funnnel_v16i8:
; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: constant_funnnel_v16i8:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512F-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512F-NEXT: vpsllvd {{.*}}(%rip), %zmm2, %zmm2
-; AVX512F-NEXT: vpord %zmm1, %zmm2, %zmm1
-; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v16i8:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VL-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm2, %zmm2
-; AVX512VL-NEXT: vpord %zmm1, %zmm2, %zmm1
-; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip){1to2}, %xmm1, %xmm0
+; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,7,6,5,4,3,2,1,u,1,2,3,4,5,6,7>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,7,0,1,2,3,4,5,6]
+; AVX512BW-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,1,2,3,4,5,6,7,u,7,6,5,4,3,2,1>
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpsllvw %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: constant_funnnel_v16i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,7,6,5,4,3,2,1,u,1,2,3,4,5,6,7>
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,7,0,1,2,3,4,5,6]
+; AVX512VBMI2-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,1,2,3,4,5,6,7,u,7,6,5,4,3,2,1>
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm3, %zmm2
-; AVX512VBMI2-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512VBMI2-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512VBMI2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: constant_funnnel_v16i8:
; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VLBW-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512VLBW-NEXT: vpmovwb %ymm1, %xmm1
-; AVX512VLBW-NEXT: movw $257, %ax # imm = 0x101
-; AVX512VLBW-NEXT: kmovd %eax, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %ymm1, %ymm1
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512VLVBMI2-NEXT: vpmovwb %ymm1, %xmm1
-; AVX512VLVBMI2-NEXT: movw $257, %ax # imm = 0x101
-; AVX512VLVBMI2-NEXT: kmovd %eax, %k1
-; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
-; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
; AVX512VLVBMI2-NEXT: vzeroupper
; AVX512VLVBMI2-NEXT: retq
;
; XOP-LABEL: constant_funnnel_v16i8:
; XOP: # %bb.0:
+; XOP-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; XOP-NEXT: vpshlb %xmm2, %xmm1, %xmm1
; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm1, %xmm1
-; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm2
-; XOP-NEXT: vpor %xmm1, %xmm2, %xmm1
-; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: constant_funnnel_v16i8:
; X32-SSE: # %bb.0:
+; X32-SSE-NEXT: movdqa %xmm0, %xmm2
+; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; X32-SSE-NEXT: pand %xmm3, %xmm2
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: packuswb %xmm2, %xmm0
+; X32-SSE-NEXT: psrlw $1, %xmm1
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: pxor %xmm2, %xmm2
; X32-SSE-NEXT: movdqa %xmm1, %xmm3
; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = <u,128,64,32,16,8,4,2>
-; X32-SSE-NEXT: pmullw %xmm4, %xmm3
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm3
; X32-SSE-NEXT: psrlw $8, %xmm3
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm5 = <u,2,4,8,16,32,64,128>
-; X32-SSE-NEXT: pmullw %xmm5, %xmm1
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: psrlw $8, %xmm1
; X32-SSE-NEXT: packuswb %xmm3, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X32-SSE-NEXT: pmullw %xmm4, %xmm3
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; X32-SSE-NEXT: pand %xmm4, %xmm3
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pmullw %xmm5, %xmm2
-; X32-SSE-NEXT: pand %xmm4, %xmm2
-; X32-SSE-NEXT: packuswb %xmm3, %xmm2
-; X32-SSE-NEXT: por %xmm1, %xmm2
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; X32-SSE-NEXT: pand %xmm1, %xmm2
-; X32-SSE-NEXT: pandn %xmm0, %xmm1
-; X32-SSE-NEXT: por %xmm1, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
%res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
ret <16 x i8> %res
@@ -3081,8 +3032,10 @@ define <2 x i64> @splatconstant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwi
; X32-SSE-LABEL: splatconstant_funnnel_v2i64:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psrlq $50, %xmm1
+; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm1[0,1]
; X32-SSE-NEXT: psllq $14, %xmm0
-; X32-SSE-NEXT: por %xmm1, %xmm0
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm0[0,1]
+; X32-SSE-NEXT: orpd %xmm1, %xmm0
; X32-SSE-NEXT: retl
%res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 14, i64 14>)
ret <2 x i64> %res
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index 5e701ff4a17c..3c621b6aeac4 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -22,122 +22,100 @@ declare <32 x i8> @llvm.fshl.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind {
; AVX1-LABEL: var_funnnel_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vpsllq %xmm4, %xmm3, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3]
-; AVX1-NEXT: vpsllq %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
-; AVX1-NEXT: vpsllq %xmm6, %xmm0, %xmm6
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [64,64]
-; AVX1-NEXT: vpsubq %xmm4, %xmm8, %xmm6
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT: vpsrlq %xmm6, %xmm7, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
-; AVX1-NEXT: vpsrlq %xmm6, %xmm7, %xmm6
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
-; AVX1-NEXT: vpsubq %xmm2, %xmm8, %xmm6
-; AVX1-NEXT: vpsrlq %xmm6, %xmm1, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
-; AVX1-NEXT: vpsrlq %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63]
+; AVX1-NEXT: vandnps %ymm3, %ymm2, %ymm4
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT: vpsrlq $1, %xmm6, %xmm6
+; AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm7
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4,5,6,7]
+; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
-; AVX1-NEXT: vorps %ymm1, %ymm3, %ymm1
-; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqq %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm3
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
-; AVX2-NEXT: vpsubq %ymm2, %ymm4, %ymm4
+; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm1
; AVX2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v4i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpsllvq %ymm2, %ymm0, %ymm3
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
-; AVX512F-NEXT: vpsubq %ymm2, %ymm4, %ymm4
+; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v4i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
-; AVX512VL-NEXT: vpsllvq %ymm2, %ymm0, %ymm3
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
-; AVX512VL-NEXT: vpsubq %ymm2, %ymm4, %ymm4
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX512VL-NEXT: vpsrlq $1, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VL-NEXT: vptestnmq %ymm2, %ymm2, %k1
-; AVX512VL-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
-; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v4i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512BW-NEXT: vpsllvq %ymm2, %ymm0, %ymm3
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
-; AVX512BW-NEXT: vpsubq %ymm2, %ymm4, %ymm4
+; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX512BW-NEXT: vpsrlq $1, %ymm1, %ymm1
; AVX512BW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
-; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v4i64:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512VBMI2-NEXT: vpsllvq %ymm2, %ymm0, %ymm3
-; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
-; AVX512VBMI2-NEXT: vpsubq %ymm2, %ymm4, %ymm4
+; AVX512VBMI2-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX512VBMI2-NEXT: vpsrlq $1, %ymm1, %ymm1
; AVX512VBMI2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
-; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v4i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
-; AVX512VLBW-NEXT: vpsllvq %ymm2, %ymm0, %ymm3
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
-; AVX512VLBW-NEXT: vpsubq %ymm2, %ymm4, %ymm4
+; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX512VLBW-NEXT: vpsrlq $1, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VLBW-NEXT: vptestnmq %ymm2, %ymm2, %k1
-; AVX512VLBW-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v4i64:
@@ -147,39 +125,36 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
;
; XOPAVX1-LABEL: var_funnnel_v4i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2
-; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; XOPAVX1-NEXT: vpshlq %xmm4, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpshlq %xmm2, %xmm0, %xmm5
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [64,64]
-; XOPAVX1-NEXT: vpsubq %xmm5, %xmm4, %xmm6
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
-; XOPAVX1-NEXT: vpshlq %xmm6, %xmm7, %xmm6
-; XOPAVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm5
-; XOPAVX1-NEXT: vpshlq %xmm5, %xmm1, %xmm1
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
-; XOPAVX1-NEXT: vorps %ymm1, %ymm3, %ymm1
-; XOPAVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpcomeqq %xmm3, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; XOPAVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63]
+; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm4
+; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; XOPAVX1-NEXT: vpshlq %xmm5, %xmm6, %xmm5
+; XOPAVX1-NEXT: vpshlq %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; XOPAVX1-NEXT: vandnps %ymm3, %ymm2, %ymm2
+; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX1-NEXT: vpsubq %xmm3, %xmm4, %xmm3
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; XOPAVX1-NEXT: vpsrlq $1, %xmm5, %xmm5
+; XOPAVX1-NEXT: vpshlq %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
+; XOPAVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpshlq %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_funnnel_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
-; XOPAVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm3
-; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
-; XOPAVX2-NEXT: vpsubq %ymm2, %ymm4, %ymm4
+; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; XOPAVX2-NEXT: vpsrlq $1, %ymm1, %ymm1
; XOPAVX2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
-; XOPAVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpcmpeqq %ymm3, %ymm2, %ymm2
-; XOPAVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
ret <4 x i64> %res
@@ -188,137 +163,116 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) nounwind {
; AVX1-LABEL: var_funnnel_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [32,32,32,32]
-; AVX1-NEXT: vpsubd %xmm3, %xmm8, %xmm6
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm6[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [31,31,31,31,31,31,31,31]
+; AVX1-NEXT: vandnps %ymm8, %ymm2, %ymm4
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT: vpsrld %xmm4, %xmm7, %xmm4
-; AVX1-NEXT: vpsrlq $32, %xmm6, %xmm5
-; AVX1-NEXT: vpsrld %xmm5, %xmm7, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpsrld $1, %xmm7, %xmm7
+; AVX1-NEXT: vpsrld %xmm6, %xmm7, %xmm6
+; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm3
+; AVX1-NEXT: vpsrld %xmm3, %xmm7, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4,5,6,7]
; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm6[2],xmm9[2],xmm6[3],xmm9[3]
-; AVX1-NEXT: vpsrld %xmm4, %xmm7, %xmm4
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm9[2],xmm5[3],xmm9[3]
; AVX1-NEXT: vpsrld %xmm6, %xmm7, %xmm6
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
-; AVX1-NEXT: vpsubd %xmm2, %xmm8, %xmm5
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
-; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm7
-; AVX1-NEXT: vpsrld %xmm7, %xmm1, %xmm7
-; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7]
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm5[2],xmm9[2],xmm5[3],xmm9[3]
-; AVX1-NEXT: vpsrld %xmm7, %xmm1, %xmm7
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
-; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm7[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; AVX1-NEXT: vpslld $23, %xmm3, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
-; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vpmulld %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpslld $23, %xmm2, %xmm6
-; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
-; AVX1-NEXT: vpmulld %xmm5, %xmm0, %xmm5
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
-; AVX1-NEXT: vorps %ymm1, %ymm4, %ymm1
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm9, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm9, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsrld %xmm5, %xmm7, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3],xmm5[4,5],xmm3[6,7]
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
+; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5
+; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm6
+; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm9[2],xmm4[3],xmm9[3]
+; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm8, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpmulld %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
+; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm3
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
-; AVX2-NEXT: vpsubd %ymm2, %ymm4, %ymm4
+; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
; AVX2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v8i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm3
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
-; AVX512F-NEXT: vpsubd %ymm2, %ymm4, %ymm4
+; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX512F-NEXT: vpsrld $1, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v8i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
-; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm3
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
-; AVX512VL-NEXT: vpsubd %ymm2, %ymm4, %ymm4
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX512VL-NEXT: vpsrld $1, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VL-NEXT: vptestnmd %ymm2, %ymm2, %k1
-; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v8i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512BW-NEXT: vpsllvd %ymm2, %ymm0, %ymm3
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT: vpsubd %ymm2, %ymm4, %ymm4
+; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX512BW-NEXT: vpsrld $1, %ymm1, %ymm1
; AVX512BW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
-; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v8i32:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512VBMI2-NEXT: vpsllvd %ymm2, %ymm0, %ymm3
-; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
-; AVX512VBMI2-NEXT: vpsubd %ymm2, %ymm4, %ymm4
+; AVX512VBMI2-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX512VBMI2-NEXT: vpsrld $1, %ymm1, %ymm1
; AVX512VBMI2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
-; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v8i32:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
-; AVX512VLBW-NEXT: vpsllvd %ymm2, %ymm0, %ymm3
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %ymm2, %ymm4, %ymm4
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX512VLBW-NEXT: vpsrld $1, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VLBW-NEXT: vptestnmd %ymm2, %ymm2, %k1
-; AVX512VLBW-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v8i32:
@@ -328,39 +282,36 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
;
; XOPAVX1-LABEL: var_funnnel_v8i32:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2
-; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; XOPAVX1-NEXT: vpshld %xmm4, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpshld %xmm2, %xmm0, %xmm5
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
-; XOPAVX1-NEXT: vpsubd %xmm5, %xmm4, %xmm6
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
-; XOPAVX1-NEXT: vpshld %xmm6, %xmm7, %xmm6
-; XOPAVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm5
-; XOPAVX1-NEXT: vpshld %xmm5, %xmm1, %xmm1
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
-; XOPAVX1-NEXT: vorps %ymm1, %ymm3, %ymm1
-; XOPAVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpcomeqd %xmm3, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; XOPAVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm4
+; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; XOPAVX1-NEXT: vpshld %xmm5, %xmm6, %xmm5
+; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; XOPAVX1-NEXT: vandnps %ymm3, %ymm2, %ymm2
+; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX1-NEXT: vpsubd %xmm3, %xmm4, %xmm3
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; XOPAVX1-NEXT: vpsrld $1, %xmm5, %xmm5
+; XOPAVX1-NEXT: vpshld %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
+; XOPAVX1-NEXT: vpsrld $1, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpshld %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_funnnel_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
-; XOPAVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm3
-; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
-; XOPAVX2-NEXT: vpsubd %ymm2, %ymm4, %ymm4
+; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; XOPAVX2-NEXT: vpsrld $1, %ymm1, %ymm1
; XOPAVX2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
-; XOPAVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2
-; XOPAVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
ret <8 x i32> %res
@@ -369,180 +320,162 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind {
; AVX1-LABEL: var_funnnel_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16]
-; AVX1-NEXT: vpsubw %xmm3, %xmm8, %xmm5
+; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vandnps %ymm8, %ymm2, %ymm4
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
; AVX1-NEXT: vpsllw $12, %xmm5, %xmm6
; AVX1-NEXT: vpsllw $4, %xmm5, %xmm5
; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT: vpsrlw $8, %xmm7, %xmm4
-; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm7, %xmm4
-; AVX1-NEXT: vpsrlw $4, %xmm4, %xmm5
-; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpsrlw $2, %xmm4, %xmm5
-; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm5
-; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpsubw %xmm2, %xmm8, %xmm5
-; AVX1-NEXT: vpsllw $12, %xmm5, %xmm6
-; AVX1-NEXT: vpsllw $4, %xmm5, %xmm5
-; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm6
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm7
-; AVX1-NEXT: vpblendvb %xmm5, %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5
-; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm5
+; AVX1-NEXT: vpsrlw $9, %xmm7, %xmm3
+; AVX1-NEXT: vpsrlw $1, %xmm7, %xmm7
+; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm7, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm3, %xmm5
+; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm5
; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm5
+; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm5
; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpsllw $12, %xmm4, %xmm5
+; AVX1-NEXT: vpsllw $4, %xmm4, %xmm4
+; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm5
+; AVX1-NEXT: vpsrlw $9, %xmm1, %xmm6
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4
+; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm4
+; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm4
+; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm8, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX1-NEXT: vpslld $23, %xmm6, %xmm6
-; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6
-; AVX1-NEXT: vpackusdw %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm2[4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpslld $23, %xmm6, %xmm6
-; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX1-NEXT: vpslld $23, %xmm7, %xmm7
-; AVX1-NEXT: vpaddd %xmm5, %xmm7, %xmm5
-; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
-; AVX1-NEXT: vpackusdw %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm5
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
-; AVX1-NEXT: vorps %ymm1, %ymm4, %ymm1
-; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
+; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
+; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
+; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15]
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15]
-; AVX2-NEXT: vpsllvd %ymm5, %ymm4, %ymm4
-; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
-; AVX2-NEXT: vpsllvd %ymm6, %ymm5, %ymm5
-; AVX2-NEXT: vpsrld $16, %ymm5, %ymm5
-; AVX2-NEXT: vpackusdw %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2-NEXT: vpsubw %ymm2, %ymm6, %ymm6
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm6[4],ymm3[4],ymm6[5],ymm3[5],ymm6[6],ymm3[6],ymm6[7],ymm3[7],ymm6[12],ymm3[12],ymm6[13],ymm3[13],ymm6[14],ymm3[14],ymm6[15],ymm3[15]
-; AVX2-NEXT: vpsrlvd %ymm7, %ymm5, %ymm5
-; AVX2-NEXT: vpsrld $16, %ymm5, %ymm5
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[8],ymm3[8],ymm6[9],ymm3[9],ymm6[10],ymm3[10],ymm6[11],ymm3[11]
-; AVX2-NEXT: vpsrlvd %ymm6, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
+; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm5[4],ymm1[4],ymm5[5],ymm1[5],ymm5[6],ymm1[6],ymm5[7],ymm1[7],ymm5[12],ymm1[12],ymm5[13],ymm1[13],ymm5[14],ymm1[14],ymm5[15],ymm1[15]
+; AVX2-NEXT: vpsrlvd %ymm6, %ymm7, %ymm6
+; AVX2-NEXT: vpsrld $16, %ymm6, %ymm6
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11]
+; AVX2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
-; AVX2-NEXT: vpackusdw %ymm5, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm1
-; AVX2-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpackusdw %ymm6, %ymm1, %ymm1
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15]
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm5[4],ymm2[5],ymm5[5],ymm2[6],ymm5[6],ymm2[7],ymm5[7],ymm2[12],ymm5[12],ymm2[13],ymm5[13],ymm2[14],ymm5[14],ymm2[15],ymm5[15]
+; AVX2-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[8],ymm5[8],ymm2[9],ymm5[9],ymm2[10],ymm5[10],ymm2[11],ymm5[11]
+; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v16i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512F-NEXT: vpsllvd %zmm4, %zmm3, %zmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %ymm2, %ymm4, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm4
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
-; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1
-; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
-; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v16i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512VL-NEXT: vpsllvd %zmm4, %zmm3, %zmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %ymm2, %ymm4, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
-; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1
-; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1
-; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v16i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %ymm2, %ymm4, %ymm4
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v16i16:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm3
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT: vpsubw %ymm2, %ymm4, %ymm4
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VBMI2-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX512VBMI2-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v16i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VLBW-NEXT: vpsllvw %ymm2, %ymm0, %ymm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %ymm2, %ymm4, %ymm4
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VLBW-NEXT: vptestnmw %ymm2, %ymm2, %k1
-; AVX512VLBW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpsllvw %ymm2, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v16i16:
@@ -552,49 +485,47 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
;
; XOPAVX1-LABEL: var_funnnel_v16i16:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2
-; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; XOPAVX1-NEXT: vpshlw %xmm4, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpshlw %xmm2, %xmm0, %xmm5
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; XOPAVX1-NEXT: vpsubw %xmm5, %xmm4, %xmm6
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
-; XOPAVX1-NEXT: vpshlw %xmm6, %xmm7, %xmm6
-; XOPAVX1-NEXT: vpsubw %xmm5, %xmm2, %xmm5
-; XOPAVX1-NEXT: vpshlw %xmm5, %xmm1, %xmm1
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
-; XOPAVX1-NEXT: vorps %ymm1, %ymm3, %ymm1
-; XOPAVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpcomeqw %xmm3, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpcomeqw %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; XOPAVX1-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm4
+; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; XOPAVX1-NEXT: vpshlw %xmm5, %xmm6, %xmm5
+; XOPAVX1-NEXT: vpshlw %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; XOPAVX1-NEXT: vandnps %ymm3, %ymm2, %ymm2
+; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX1-NEXT: vpsubw %xmm3, %xmm4, %xmm3
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; XOPAVX1-NEXT: vpsrlw $1, %xmm5, %xmm5
+; XOPAVX1-NEXT: vpshlw %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm2
+; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpshlw %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_funnnel_v16i16:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
-; XOPAVX2-NEXT: vpshlw %xmm4, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpshlw %xmm2, %xmm0, %xmm4
-; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; XOPAVX2-NEXT: vpsubw %ymm2, %ymm4, %ymm4
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
; XOPAVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
-; XOPAVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; XOPAVX2-NEXT: vpsubw %xmm5, %xmm6, %xmm5
-; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm7
-; XOPAVX2-NEXT: vpshlw %xmm5, %xmm7, %xmm5
-; XOPAVX2-NEXT: vpsubw %xmm4, %xmm6, %xmm4
-; XOPAVX2-NEXT: vpshlw %xmm4, %xmm1, %xmm1
-; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
-; XOPAVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
-; XOPAVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm6
+; XOPAVX2-NEXT: vpshlw %xmm5, %xmm6, %xmm5
+; XOPAVX2-NEXT: vpshlw %xmm4, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; XOPAVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT: vpsubw %xmm3, %xmm4, %xmm3
+; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
+; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
+; XOPAVX2-NEXT: vpshlw %xmm3, %xmm5, %xmm3
+; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm2
+; XOPAVX2-NEXT: vpshlw %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt)
ret <16 x i16> %res
@@ -603,297 +534,288 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) nounwind {
; AVX1-LABEL: var_funnnel_v32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vpsrlw $4, %xmm4, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm6
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX1-NEXT: vpsubb %xmm3, %xmm9, %xmm5
-; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
-; AVX1-NEXT: vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpsrlw $2, %xmm4, %xmm6
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX1-NEXT: vpand %xmm6, %xmm10, %xmm6
-; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm6
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm5
+; AVX1-NEXT: vpsrlw $4, %xmm5, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm3, %xmm10, %xmm7
+; AVX1-NEXT: vmovaps {{.*#+}} ymm9 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX1-NEXT: vandnps %ymm9, %ymm2, %ymm8
+; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm3
+; AVX1-NEXT: vpsllw $5, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm3, %xmm7, %xmm5, %xmm5
+; AVX1-NEXT: vpsrlw $2, %xmm5, %xmm7
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm7
+; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm3, %xmm7, %xmm5, %xmm5
+; AVX1-NEXT: vpsrlw $1, %xmm5, %xmm7
+; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7
+; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm3, %xmm7, %xmm5, %xmm3
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5
-; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5
-; AVX1-NEXT: vpsubb %xmm2, %xmm9, %xmm6
-; AVX1-NEXT: vpsllw $5, %xmm6, %xmm6
-; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm5
; AVX1-NEXT: vpand %xmm5, %xmm10, %xmm5
-; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vpsllw $5, %xmm8, %xmm7
+; AVX1-NEXT: vpblendvb %xmm7, %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm5
+; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm6
; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm5
-; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm8
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpsllw $4, %xmm4, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vpsllw $5, %xmm3, %xmm7
-; AVX1-NEXT: vpblendvb %xmm7, %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpsllw $2, %xmm4, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm5
-; AVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7
-; AVX1-NEXT: vpblendvb %xmm7, %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm5
-; AVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7
-; AVX1-NEXT: vpblendvb %xmm7, %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpsllw $4, %xmm0, %xmm5
-; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vpsllw $5, %xmm2, %xmm6
-; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm0, %xmm5
-; AVX1-NEXT: vpsllw $2, %xmm5, %xmm7
-; AVX1-NEXT: vpand %xmm1, %xmm7, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm5
+; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpsllw $4, %xmm3, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %ymm2, %ymm9, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
+; AVX1-NEXT: vpsllw $5, %xmm6, %xmm6
+; AVX1-NEXT: vpblendvb %xmm6, %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsllw $2, %xmm3, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4
; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vpblendvb %xmm6, %xmm1, %xmm5, %xmm1
-; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm5
+; AVX1-NEXT: vpblendvb %xmm6, %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm4
; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; AVX1-NEXT: vorps %ymm1, %ymm8, %ymm1
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vpblendvb %xmm6, %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm4
+; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpsllw $5, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $2, %xmm0, %xmm4
+; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4
+; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm4
+; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v32i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vpsllw $5, %ymm2, %ymm3
-; AVX2-NEXT: vpsllw $4, %ymm0, %ymm4
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
-; AVX2-NEXT: vpblendvb %ymm3, %ymm4, %ymm0, %ymm4
-; AVX2-NEXT: vpsllw $2, %ymm4, %ymm5
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm5, %ymm5
-; AVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX2-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
-; AVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm5
-; AVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX2-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX2-NEXT: vpsubb %ymm2, %ymm4, %ymm4
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
; AVX2-NEXT: vpsllw $5, %ymm4, %ymm4
-; AVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm5
+; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm6
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm6, %ymm6
; AVX2-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm4
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
-; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm4
+; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm6
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm6, %ymm6
+; AVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm6
+; AVX2-NEXT: vpand %ymm5, %ymm6, %ymm5
+; AVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpblendvb %ymm4, %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpsllw $5, %ymm2, %ymm2
+; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm3
+; AVX2-NEXT: vpsllw $4, %ymm0, %ymm4
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
-; AVX2-NEXT: vpaddb %ymm5, %ymm5, %ymm5
-; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsllw $2, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; AVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v32i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm3
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
-; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
-; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm0, %ymm4
-; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm5
-; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm5, %ymm5
-; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm5
-; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %ymm2, %ymm4, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm4
; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm4
-; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm5
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm6
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm6, %ymm6
; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm4
-; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
-; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm4
+; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm6
+; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm6, %ymm6
+; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm6
+; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm5
+; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
-; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
-; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2
+; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v32i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm3
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
-; AVX512VL-NEXT: vpblendvb %ymm3, %ymm4, %ymm0, %ymm4
-; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm5
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm5, %ymm5
-; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
-; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm5
-; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %ymm2, %ymm4, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4
; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4
-; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm5
+; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm6
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm6, %ymm6
; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm4
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
-; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm4
+; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm6
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm6, %ymm6
+; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm6
+; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm5
+; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
+; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
-; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
-; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v32i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
-; AVX512BW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %ymm2, %ymm4, %ymm4
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v32i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
-; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VBMI2-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512VBMI2-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
-; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v32i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
-; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm4, %ymm4
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512VLBW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VLBW-NEXT: vptestnmb %ymm2, %ymm2, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v32i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
-; AVX512VLVBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLVBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLVBMI2-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VLVBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512VLVBMI2-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VLVBMI2-NEXT: vptestnmb %ymm2, %ymm2, %k1
-; AVX512VLVBMI2-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
-; AVX512VLVBMI2-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: var_funnnel_v32i8:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2
-; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; XOPAVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
; XOPAVX1-NEXT: vpshlb %xmm4, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm5
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; XOPAVX1-NEXT: vpsubb %xmm5, %xmm4, %xmm6
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
-; XOPAVX1-NEXT: vpshlb %xmm6, %xmm7, %xmm6
-; XOPAVX1-NEXT: vpsubb %xmm5, %xmm2, %xmm5
-; XOPAVX1-NEXT: vpshlb %xmm5, %xmm1, %xmm1
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
-; XOPAVX1-NEXT: vorps %ymm1, %ymm3, %ymm1
-; XOPAVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpcomeqb %xmm3, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; XOPAVX1-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX1-NEXT: vandnps %ymm8, %ymm2, %ymm6
+; XOPAVX1-NEXT: vextractf128 $1, %ymm6, %xmm7
+; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX1-NEXT: vpsubb %xmm7, %xmm5, %xmm7
+; XOPAVX1-NEXT: vpshlb %xmm7, %xmm3, %xmm3
+; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpsubb %xmm6, %xmm5, %xmm4
+; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; XOPAVX1-NEXT: vandps %ymm2, %ymm8, %ymm2
+; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; XOPAVX1-NEXT: vpshlb %xmm3, %xmm4, %xmm3
+; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_funnnel_v32i8:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
-; XOPAVX2-NEXT: vpshlb %xmm4, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm4
-; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; XOPAVX2-NEXT: vpsubb %ymm2, %ymm4, %ymm4
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
; XOPAVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
-; XOPAVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; XOPAVX2-NEXT: vpsubb %xmm5, %xmm6, %xmm5
-; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm7
-; XOPAVX2-NEXT: vpshlb %xmm5, %xmm7, %xmm5
-; XOPAVX2-NEXT: vpsubb %xmm4, %xmm6, %xmm4
-; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1
-; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
-; XOPAVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
-; XOPAVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm6
+; XOPAVX2-NEXT: vpshlb %xmm5, %xmm6, %xmm5
+; XOPAVX2-NEXT: vpshlb %xmm4, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; XOPAVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT: vpsubb %xmm3, %xmm4, %xmm3
+; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
+; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
+; XOPAVX2-NEXT: vpshlb %xmm3, %xmm5, %xmm3
+; XOPAVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm2
+; XOPAVX2-NEXT: vpshlb %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
ret <32 x i8> %res
@@ -906,114 +828,98 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v4i64:
; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; AVX1-NEXT: vandnps {{.*}}(%rip), %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpsrlq $1, %xmm5, %xmm5
+; AVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; AVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlq %xmm3, %xmm1, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX1-NEXT: vpsrlq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3
-; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
-; AVX1-NEXT: vorps %ymm1, %ymm3, %ymm1
-; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_funnnel_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
-; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsllq %xmm2, %ymm0, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpbroadcastq %xmm2, %ymm3
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpsllq %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [63,63,63,63]
+; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlvq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v4i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpsllq %xmm2, %ymm0, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm3
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT: vpsllq %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [63,63,63,63]
+; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlvq %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v4i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to2}, %xmm2, %xmm2
-; AVX512VL-NEXT: vpsllq %xmm2, %ymm0, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512VL-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VL-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX512VL-NEXT: vptestnmq %ymm2, %ymm2, %k1
-; AVX512VL-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
-; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VL-NEXT: vpbroadcastq %xmm2, %ymm3
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT: vpsllq %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlq $1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpandnq {{.*}}(%rip){1to4}, %ymm3, %ymm2
+; AVX512VL-NEXT: vpsrlvq %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v4i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsllq %xmm2, %ymm0, %ymm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
-; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512BW-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512BW-NEXT: vpbroadcastq %xmm2, %ymm3
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpsllq %xmm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [63,63,63,63]
+; AVX512BW-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX512BW-NEXT: vpsrlq $1, %ymm1, %ymm1
+; AVX512BW-NEXT: vpsrlvq %ymm2, %ymm1, %ymm1
+; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v4i64:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpsllq %xmm2, %ymm0, %ymm3
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512VBMI2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
-; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %ymm3
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsllq %xmm2, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [63,63,63,63]
+; AVX512VBMI2-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX512VBMI2-NEXT: vpsrlq $1, %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpsrlvq %ymm2, %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v4i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to2}, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpsllq %xmm2, %ymm0, %ymm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX512VLBW-NEXT: vptestnmq %ymm2, %ymm2, %k1
-; AVX512VLBW-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %ymm3
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsllq %xmm2, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpsrlq $1, %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpandnq {{.*}}(%rip){1to4}, %ymm3, %ymm2
+; AVX512VLBW-NEXT: vpsrlvq %ymm2, %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i64:
@@ -1024,38 +930,37 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
;
; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm4
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4
+; XOPAVX1-NEXT: vpsllq %xmm2, %xmm4, %xmm4
+; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; XOPAVX1-NEXT: vandnps {{.*}}(%rip), %ymm3, %ymm2
+; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX1-NEXT: vpsubq %xmm3, %xmm4, %xmm3
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm5
-; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
-; XOPAVX1-NEXT: vorps %ymm1, %ymm3, %ymm1
-; XOPAVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; XOPAVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX1-NEXT: vpsrlq $1, %xmm5, %xmm5
+; XOPAVX1-NEXT: vpshlq %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
+; XOPAVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpshlq %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_funnnel_v4i64:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
-; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpsllq %xmm2, %ymm0, %ymm3
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; XOPAVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; XOPAVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
-; XOPAVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
-; XOPAVX2-NEXT: vpbroadcastq %xmm2, %ymm2
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpcmpeqq %ymm3, %ymm2, %ymm2
-; XOPAVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vpbroadcastq %xmm2, %ymm3
+; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX2-NEXT: vpsllq %xmm2, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [63,63,63,63]
+; XOPAVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; XOPAVX2-NEXT: vpsrlq $1, %ymm1, %ymm1
+; XOPAVX2-NEXT: vpsrlvq %ymm2, %ymm1, %ymm1
+; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%splat = shufflevector <4 x i64> %amt, <4 x i64> undef, <4 x i32> zeroinitializer
%res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %splat)
@@ -1065,128 +970,122 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v8i32:
; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; AVX1-NEXT: vandnps {{.*}}(%rip), %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT: vpsrld $1, %xmm6, %xmm6
+; AVX1-NEXT: vpsrld %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm7
+; AVX1-NEXT: vpsrld %xmm7, %xmm6, %xmm7
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4,5,6,7]
+; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm8[2],xmm4[3],xmm8[3]
+; AVX1-NEXT: vpsrld %xmm7, %xmm6, %xmm7
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX1-NEXT: vpsrld %xmm4, %xmm6, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
+; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5
+; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm6
+; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpslld %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpslld %xmm4, %xmm0, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32]
-; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpsrld %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
-; AVX1-NEXT: vorps %ymm1, %ymm3, %ymm1
-; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX1-NEXT: vpslld %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_funnnel_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX2-NEXT: vpslld %xmm3, %ymm0, %ymm3
; AVX2-NEXT: vpbroadcastd %xmm2, %ymm2
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX2-NEXT: vpsrld %xmm4, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX2-NEXT: vpslld %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v8i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX512F-NEXT: vpslld %xmm3, %ymm0, %ymm3
; AVX512F-NEXT: vpbroadcastd %xmm2, %ymm2
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512F-NEXT: vpsrld %xmm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX512F-NEXT: vpsrld $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512F-NEXT: vpslld %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v8i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX512VL-NEXT: vpslld %xmm3, %ymm0, %ymm3
; AVX512VL-NEXT: vpbroadcastd %xmm2, %ymm2
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512VL-NEXT: vpsrld %xmm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VL-NEXT: vptestnmd %ymm2, %ymm2, %k1
-; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX512VL-NEXT: vpsrld $1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512VL-NEXT: vpslld %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v8i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX512BW-NEXT: vpslld %xmm3, %ymm0, %ymm3
; AVX512BW-NEXT: vpbroadcastd %xmm2, %ymm2
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512BW-NEXT: vpsrld %xmm4, %ymm1, %ymm1
-; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX512BW-NEXT: vpsrld $1, %ymm1, %ymm1
+; AVX512BW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
+; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512BW-NEXT: vpslld %xmm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v8i32:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX512VBMI2-NEXT: vpslld %xmm3, %ymm0, %ymm3
; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %ymm2
-; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512VBMI2-NEXT: vpsrld %xmm4, %ymm1, %ymm1
-; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512VBMI2-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX512VBMI2-NEXT: vpsrld $1, %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512VBMI2-NEXT: vpslld %xmm2, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v8i32:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX512VLBW-NEXT: vpslld %xmm3, %ymm0, %ymm3
; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %ymm2
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512VLBW-NEXT: vpsrld %xmm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VLBW-NEXT: vptestnmd %ymm2, %ymm2, %k1
-; AVX512VLBW-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX512VLBW-NEXT: vpsrld $1, %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512VLBW-NEXT: vpslld %xmm2, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i32:
@@ -1197,42 +1096,39 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
;
; XOPAVX1-LABEL: splatvar_funnnel_v8i32:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
-; XOPAVX1-NEXT: vpslld %xmm4, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpslld %xmm4, %xmm0, %xmm4
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32]
-; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; XOPAVX1-NEXT: vpslld %xmm2, %xmm4, %xmm4
+; XOPAVX1-NEXT: vpslld %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; XOPAVX1-NEXT: vandnps {{.*}}(%rip), %ymm3, %ymm2
+; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX1-NEXT: vpsubd %xmm3, %xmm4, %xmm3
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; XOPAVX1-NEXT: vpsrld %xmm4, %xmm5, %xmm5
-; XOPAVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
-; XOPAVX1-NEXT: vorps %ymm1, %ymm3, %ymm1
-; XOPAVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; XOPAVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX1-NEXT: vpsrld $1, %xmm5, %xmm5
+; XOPAVX1-NEXT: vpshld %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
+; XOPAVX1-NEXT: vpsrld $1, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpshld %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_funnnel_v8i32:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; XOPAVX2-NEXT: vpslld %xmm3, %ymm0, %ymm3
; XOPAVX2-NEXT: vpbroadcastd %xmm2, %ymm2
-; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; XOPAVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; XOPAVX2-NEXT: vpsrld %xmm4, %ymm1, %ymm1
-; XOPAVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2
-; XOPAVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; XOPAVX2-NEXT: vpsrld $1, %ymm1, %ymm1
+; XOPAVX2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
+; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; XOPAVX2-NEXT: vpslld %xmm2, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%splat = shufflevector <8 x i32> %amt, <8 x i32> undef, <8 x i32> zeroinitializer
%res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %splat)
@@ -1242,126 +1138,137 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v16i16:
; AVX1: # %bb.0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; AVX1-NEXT: vandnps {{.*}}(%rip), %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpsllw $12, %xmm4, %xmm5
+; AVX1-NEXT: vpsllw $4, %xmm4, %xmm4
+; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT: vpsrlw $9, %xmm6, %xmm7
+; AVX1-NEXT: vpsrlw $1, %xmm6, %xmm6
+; AVX1-NEXT: vpblendvb %xmm4, %xmm7, %xmm6, %xmm4
+; AVX1-NEXT: vpsrlw $4, %xmm4, %xmm6
+; AVX1-NEXT: vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlw $2, %xmm4, %xmm6
+; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm6
+; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpsllw $12, %xmm3, %xmm5
+; AVX1-NEXT: vpsllw $4, %xmm3, %xmm3
+; AVX1-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm5
+; AVX1-NEXT: vpsrlw $9, %xmm1, %xmm6
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm3
+; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm3
+; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm3
+; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX1-NEXT: vpsllw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
-; AVX1-NEXT: vorps %ymm1, %ymm3, %ymm1
-; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_funnnel_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX2-NEXT: vpsllw %xmm3, %ymm0, %ymm3
; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15]
+; AVX2-NEXT: vpandn {{.*}}(%rip), %ymm2, %ymm5
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15]
+; AVX2-NEXT: vpsrlvd %ymm6, %ymm4, %ymm4
+; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11]
+; AVX2-NEXT: vpsrlvd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT: vpackusdw %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v16i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3
; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm3
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpandn {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v16i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3
; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm3
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vpandn {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1
+; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v16i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512BW-NEXT: vpsllw %xmm3, %ymm0, %ymm3
; AVX512BW-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512BW-NEXT: vpandn {{.*}}(%rip), %ymm2, %ymm3
+; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512BW-NEXT: vpsllw %xmm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v16i16:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VBMI2-NEXT: vpsllw %xmm3, %ymm0, %ymm3
; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VBMI2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VBMI2-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpandn {{.*}}(%rip), %ymm2, %ymm3
+; AVX512VBMI2-NEXT: vpsrlvw %zmm3, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VBMI2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v16i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VLBW-NEXT: vpsllw %xmm3, %ymm0, %ymm3
; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VLBW-NEXT: vptestnmw %ymm2, %ymm2, %k1
-; AVX512VLBW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpandn {{.*}}(%rip), %ymm2, %ymm3
+; AVX512VLBW-NEXT: vpsrlvw %ymm3, %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VLBW-NEXT: vpsllw %xmm2, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i16:
@@ -1372,42 +1279,46 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
;
; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; XOPAVX1-NEXT: vpsllw %xmm4, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm4
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; XOPAVX1-NEXT: vpsllw %xmm2, %xmm4, %xmm4
+; XOPAVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; XOPAVX1-NEXT: vandnps {{.*}}(%rip), %ymm3, %ymm2
+; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX1-NEXT: vpsubw %xmm3, %xmm4, %xmm3
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5
-; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
-; XOPAVX1-NEXT: vorps %ymm1, %ymm3, %ymm1
-; XOPAVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpcomeqw %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; XOPAVX1-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT: vpsrlw $1, %xmm5, %xmm5
+; XOPAVX1-NEXT: vpshlw %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm2
+; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpshlw %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_funnnel_v16i16:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; XOPAVX2-NEXT: vpsllw %xmm3, %ymm0, %ymm3
; XOPAVX2-NEXT: vpbroadcastw %xmm2, %ymm2
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; XOPAVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; XOPAVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
-; XOPAVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vpandn {{.*}}(%rip), %ymm2, %ymm3
+; XOPAVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX2-NEXT: vpsubw %xmm4, %xmm5, %xmm4
+; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
+; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm6
+; XOPAVX2-NEXT: vpshlw %xmm4, %xmm6, %xmm4
+; XOPAVX2-NEXT: vpsubw %xmm3, %xmm5, %xmm3
+; XOPAVX2-NEXT: vpshlw %xmm3, %xmm1, %xmm1
+; XOPAVX2-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
+; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; XOPAVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%splat = shufflevector <16 x i16> %amt, <16 x i16> undef, <16 x i32> zeroinitializer
%res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %splat)
@@ -1417,229 +1328,260 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v32i8:
; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
+; AVX1-NEXT: vpshufb %xmm8, %xmm2, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
+; AVX1-NEXT: vandnps {{.*}}(%rip), %ymm4, %ymm4
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
+; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT: vpsrlw $1, %xmm6, %xmm6
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vpsrlw $4, %xmm6, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm3, %xmm9, %xmm3
+; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm6
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT: vpand %xmm6, %xmm10, %xmm6
+; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendvb %xmm5, %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm6
+; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendvb %xmm5, %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5
+; AVX1-NEXT: vpand %xmm5, %xmm9, %xmm5
+; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm5
+; AVX1-NEXT: vpand %xmm5, %xmm10, %xmm5
+; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm5
+; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
+; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsllw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpsllw %xmm4, %xmm5, %xmm6
-; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX1-NEXT: vpshufb %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX1-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
-; AVX1-NEXT: vpsrlw %xmm4, %xmm6, %xmm6
-; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
-; AVX1-NEXT: vorps %ymm1, %ymm3, %ymm1
-; AVX1-NEXT: vpcmpeqb %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpsllw %xmm2, %xmm4, %xmm4
+; AVX1-NEXT: vpshufb %xmm8, %xmm4, %xmm4
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_funnnel_v32i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpsllw %xmm3, %ymm0, %ymm4
-; AVX2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX2-NEXT: vpsllw %xmm3, %xmm5, %xmm3
-; AVX2-NEXT: vpbroadcastb %xmm3, %ymm3
+; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
+; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm4
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX2-NEXT: vpandn {{.*}}(%rip), %ymm2, %ymm5
+; AVX2-NEXT: vpsllw $5, %ymm5, %ymm5
+; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm4
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX2-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm4
; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpaddb %ymm5, %ymm5, %ymm4
+; AVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vpsllw %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlw %xmm4, %xmm5, %xmm4
-; AVX2-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4
-; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v32i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4
-; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpsllw %xmm3, %xmm5, %xmm3
-; AVX512F-NEXT: vpbroadcastb %xmm3, %ymm3
+; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm4
+; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT: vpandn {{.*}}(%rip), %ymm2, %ymm5
+; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm4
+; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm4
; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT: vpsllw %xmm2, %xmm3, %xmm2
; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw %xmm4, %xmm5, %xmm4
-; AVX512F-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4
-; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v32i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4
-; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT: vpsllw %xmm3, %xmm5, %xmm3
-; AVX512VL-NEXT: vpbroadcastb %xmm3, %ymm3
-; AVX512VL-NEXT: vpand %ymm3, %ymm4, %ymm3
; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlw %xmm4, %xmm5, %xmm4
-; AVX512VL-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4
-; AVX512VL-NEXT: vpternlogq $236, %ymm1, %ymm3, %ymm4
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpcmpeqb %ymm1, %ymm2, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0
+; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm4
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512VL-NEXT: vpandn {{.*}}(%rip), %ymm2, %ymm5
+; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm4
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm4
+; AVX512VL-NEXT: vpand %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm3
+; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsllw %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX512VL-NEXT: vpternlogq $236, %ymm3, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v32i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpbroadcastb %xmm2, %ymm3
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpsllw %xmm4, %zmm3, %zmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512BW-NEXT: vpandn {{.*}}(%rip), %ymm3, %ymm2
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %ymm3
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VBMI2-NEXT: vpsllw %xmm4, %zmm3, %zmm3
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsllw %xmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512VBMI2-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
-; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VBMI2-NEXT: vpandn {{.*}}(%rip), %ymm3, %ymm2
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v32i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %ymm3
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm3, %zmm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512VLBW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VLBW-NEXT: vptestnmb %ymm2, %ymm2, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VLBW-NEXT: vpandn {{.*}}(%rip), %ymm3, %ymm2
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %ymm3
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %zmm3, %zmm3
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512VLVBMI2-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VLVBMI2-NEXT: vptestnmb %ymm2, %ymm2, %k1
-; AVX512VLVBMI2-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
-; AVX512VLVBMI2-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT: vpandn {{.*}}(%rip), %ymm3, %ymm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLVBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_funnnel_v32i8:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; XOPAVX1-NEXT: vpshlb %xmm2, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm5
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
-; XOPAVX1-NEXT: vpsubb {{.*}}(%rip), %xmm2, %xmm5
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
-; XOPAVX1-NEXT: vpshlb %xmm5, %xmm6, %xmm6
-; XOPAVX1-NEXT: vpshlb %xmm5, %xmm1, %xmm1
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
-; XOPAVX1-NEXT: vorps %ymm1, %ymm4, %ymm1
-; XOPAVX1-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; XOPAVX1-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX1-NEXT: vandps %ymm4, %ymm2, %ymm5
+; XOPAVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
+; XOPAVX1-NEXT: vpshlb %xmm6, %xmm7, %xmm6
+; XOPAVX1-NEXT: vpshlb %xmm5, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; XOPAVX1-NEXT: vandnps %ymm4, %ymm2, %ymm2
+; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; XOPAVX1-NEXT: vpsubb %xmm4, %xmm3, %xmm4
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; XOPAVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
+; XOPAVX1-NEXT: vpshlb %xmm6, %xmm5, %xmm5
+; XOPAVX1-NEXT: vpshlb %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpshlb %xmm6, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_funnnel_v32i8:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX2-NEXT: vpbroadcastb %xmm2, %ymm2
-; XOPAVX2-NEXT: vpshlb %xmm2, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm4
-; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; XOPAVX2-NEXT: vpsubb %ymm2, %ymm4, %ymm4
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
; XOPAVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
-; XOPAVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; XOPAVX2-NEXT: vpsubb %xmm5, %xmm6, %xmm5
-; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm7
-; XOPAVX2-NEXT: vpshlb %xmm5, %xmm7, %xmm5
-; XOPAVX2-NEXT: vpsubb %xmm4, %xmm6, %xmm4
-; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1
-; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
-; XOPAVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
-; XOPAVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm6
+; XOPAVX2-NEXT: vpshlb %xmm5, %xmm6, %xmm5
+; XOPAVX2-NEXT: vpshlb %xmm4, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; XOPAVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT: vpsubb %xmm3, %xmm4, %xmm3
+; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
+; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
+; XOPAVX2-NEXT: vpshlb %xmm3, %xmm5, %xmm3
+; XOPAVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm2
+; XOPAVX2-NEXT: vpshlb %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%splat = shufflevector <32 x i8> %amt, <32 x i8> undef, <32 x i32> zeroinitializer
%res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %splat)
@@ -2137,86 +2079,77 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin
; AVX1-LABEL: constant_funnnel_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,512,1024,2048,4096,8192,16384,32768]
-; AVX1-NEXT: vpmulhuw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <u,2,4,8,16,32,64,128>
-; AVX1-NEXT: vpmulhuw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
+; AVX1-NEXT: vpmulhuw {{.*}}(%rip), %xmm2, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5,6],xmm2[7]
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
-; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_funnnel_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768>
-; AVX2-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX2-NEXT: vpmulhuw {{.*}}(%rip), %ymm1, %ymm2
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7],ymm2[8,9,10,11,12,13,14],ymm1[15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: constant_funnnel_v16i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <u,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768>
-; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpmulhuw {{.*}}(%rip), %ymm1, %ymm2
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7],ymm2[8,9,10,11,12,13,14],ymm1[15]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v16i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <u,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768>
-; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm2
-; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpmulhuw {{.*}}(%rip), %ymm1, %ymm2
+; AVX512VL-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7],ymm2[8,9,10,11,12,13,14],ymm1[15]
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v16i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15>
-; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2
-; AVX512BW-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: constant_funnnel_v16i16:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1>
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; AVX512VBMI2-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15>
-; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm2
-; AVX512VBMI2-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512VBMI2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX512VBMI2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: constant_funnnel_v16i16:
; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm2
-; AVX512VLBW-NEXT: vpor %ymm1, %ymm2, %ymm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512VLBW-NEXT: vpermi2w %ymm0, %ymm2, %ymm1
-; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v16i16:
@@ -2226,26 +2159,27 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin
;
; XOPAVX1-LABEL: constant_funnnel_v16i16:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm1, %xmm2
+; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm2
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm2
+; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm2
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm3, %xmm3
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; XOPAVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
-; XOPAVX1-NEXT: vpcmov {{.*}}(%rip), %ymm0, %ymm1, %ymm0
+; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_funnnel_v16i16:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768>
-; XOPAVX2-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1
-; XOPAVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm2
-; XOPAVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
+; XOPAVX2-NEXT: vpmulhuw {{.*}}(%rip), %ymm1, %ymm2
+; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7],ymm2[8,9,10,11,12,13,14],ymm1[15]
+; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; XOPAVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
ret <16 x i16> %res
@@ -2254,49 +2188,50 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin
define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX1-LABEL: constant_funnnel_v32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [256,128,64,32,16,8,4,2]
-; AVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [256,2,4,8,16,32,64,128]
-; AVX1-NEXT: vpmullw %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
-; AVX1-NEXT: vpmullw %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpmullw %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <u,128,64,32,16,8,4,2>
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,128,64,32,16,8,4,2]
; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <u,2,4,8,16,32,64,128>
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1,2,4,8,16,32,64,128]
; AVX1-NEXT: vpmullw %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
-; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
-; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [2,256,128,64,32,16,8,4]
+; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [2,4,8,16,32,64,128,256]
+; AVX1-NEXT: vpmullw %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
+; AVX1-NEXT: vpmullw %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpmullw %xmm7, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_funnnel_v32i8:
@@ -2305,25 +2240,25 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpsllw $2, %ymm2, %ymm4
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsllw $2, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX2-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm4
+; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
; AVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX2-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
-; AVX2-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
+; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: constant_funnnel_v32i8:
@@ -2332,25 +2267,25 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm4
-; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2
+; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm2
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
-; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
-; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
+; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
+; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v32i8:
@@ -2359,110 +2294,108 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512VL-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm2
-; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm4
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
-; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
-; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
-; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
-; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
+; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
+; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512VL-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip){1to4}, %ymm1, %ymm0
+; AVX512VL-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v32i8:
; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2
-; AVX512BW-NEXT: vporq %zmm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: constant_funnnel_v32i8:
; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VBMI2-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2
-; AVX512VBMI2-NEXT: vporq %zmm1, %zmm2, %zmm1
-; AVX512VBMI2-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512VBMI2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: constant_funnnel_v32i8:
; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2
-; AVX512VLBW-NEXT: vporq %zmm1, %zmm2, %zmm1
-; AVX512VLBW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VLBW-NEXT: movl $16843009, %eax # imm = 0x1010101
-; AVX512VLBW-NEXT: kmovd %eax, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v32i8:
; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2
-; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm2, %zmm1
-; AVX512VLVBMI2-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VLVBMI2-NEXT: movl $16843009, %eax # imm = 0x1010101
-; AVX512VLVBMI2-NEXT: kmovd %eax, %k1
-; AVX512VLVBMI2-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
-; AVX512VLVBMI2-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: constant_funnnel_v32i8:
; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
+; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <u,249,250,251,252,253,254,255,u,255,254,253,252,251,250,249>
+; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [249,250,251,252,253,254,255,0,249,0,255,254,253,252,251,250]
+; XOPAVX1-NEXT: vpshlb %xmm4, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm1
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <u,1,2,3,4,5,6,7,u,7,6,5,4,3,2,1>
-; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm3
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; XOPAVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
-; XOPAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
-; XOPAVX1-NEXT: vpcmov %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_funnnel_v32i8:
; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
+; XOPAVX2-NEXT: vpshlb %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpshlb %xmm3, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
+; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,249,250,251,252,253,254,255,u,255,254,253,252,251,250,249>
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [249,250,251,252,253,254,255,0,249,0,255,254,253,252,251,250]
; XOPAVX2-NEXT: vpshlb %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT: vpshlb %xmm3, %xmm1, %xmm1
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,1,2,3,4,5,6,7,u,7,6,5,4,3,2,1>
-; XOPAVX2-NEXT: vpshlb %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpshlb %xmm3, %xmm0, %xmm3
-; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
-; XOPAVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; XOPAVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
ret <32 x i8> %res
diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll
index 60406c45ba89..62660f3186ff 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll
@@ -18,41 +18,35 @@ declare <64 x i8> @llvm.fshl.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)
define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v8i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
-; AVX512F-NEXT: vpsllvq %zmm2, %zmm0, %zmm3
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
-; AVX512F-NEXT: vpsubq %zmm2, %zmm4, %zmm4
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpandnq %zmm3, %zmm2, %zmm4
+; AVX512F-NEXT: vpsrlq $1, %zmm1, %zmm1
; AVX512F-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
-; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512F-NEXT: vpandq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllvq %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v8i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
-; AVX512VL-NEXT: vpsllvq %zmm2, %zmm0, %zmm3
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
-; AVX512VL-NEXT: vpsubq %zmm2, %zmm4, %zmm4
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
+; AVX512VL-NEXT: vpandnq %zmm3, %zmm2, %zmm4
+; AVX512VL-NEXT: vpsrlq $1, %zmm1, %zmm1
; AVX512VL-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
-; AVX512VL-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512VL-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512VL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512VL-NEXT: vpandq %zmm3, %zmm2, %zmm2
+; AVX512VL-NEXT: vpsllvq %zmm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsllvq %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
-; AVX512BW-NEXT: vpsubq %zmm2, %zmm4, %zmm4
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
+; AVX512BW-NEXT: vpandnq %zmm3, %zmm2, %zmm4
+; AVX512BW-NEXT: vpsrlq $1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsllvq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v8i64:
@@ -62,15 +56,13 @@ define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt)
;
; AVX512VLBW-LABEL: var_funnnel_v8i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
-; AVX512VLBW-NEXT: vpsllvq %zmm2, %zmm0, %zmm3
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
-; AVX512VLBW-NEXT: vpsubq %zmm2, %zmm4, %zmm4
+; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
+; AVX512VLBW-NEXT: vpandnq %zmm3, %zmm2, %zmm4
+; AVX512VLBW-NEXT: vpsrlq $1, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512VLBW-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512VLBW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpsllvq %zmm2, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v8i64:
@@ -84,41 +76,35 @@ define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt)
define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v16i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm3
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512F-NEXT: vpsubd %zmm2, %zmm4, %zmm4
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512F-NEXT: vpandnd %zmm3, %zmm2, %zmm4
+; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1
; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
-; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1
-; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512F-NEXT: vpandd %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v16i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm3
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512VL-NEXT: vpsubd %zmm2, %zmm4, %zmm4
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512VL-NEXT: vpandnd %zmm3, %zmm2, %zmm4
+; AVX512VL-NEXT: vpsrld $1, %zmm1, %zmm1
; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
-; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1
-; AVX512VL-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512VL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512VL-NEXT: vpandd %zmm3, %zmm2, %zmm2
+; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsllvd %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT: vpsubd %zmm2, %zmm4, %zmm4
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512BW-NEXT: vpandnd %zmm3, %zmm2, %zmm4
+; AVX512BW-NEXT: vpsrld $1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vpord %zmm1, %zmm3, %zmm1
-; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT: vpandd %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v16i32:
@@ -128,15 +114,13 @@ define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %
;
; AVX512VLBW-LABEL: var_funnnel_v16i32:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512VLBW-NEXT: vpsllvd %zmm2, %zmm0, %zmm3
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %zmm2, %zmm4, %zmm4
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512VLBW-NEXT: vpandnd %zmm3, %zmm2, %zmm4
+; AVX512VLBW-NEXT: vpsrld $1, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vpord %zmm1, %zmm3, %zmm1
-; AVX512VLBW-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512VLBW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512VLBW-NEXT: vpandd %zmm3, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v16i32:
@@ -150,85 +134,77 @@ define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %
define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512F-NEXT: vpsllvd %zmm4, %zmm3, %zmm3
-; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm5
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
-; AVX512F-NEXT: vpsllvd %zmm6, %zmm4, %zmm4
-; AVX512F-NEXT: vpmovdw %zmm4, %ymm4
-; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %ymm2, %ymm4, %ymm6
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpandnq %zmm3, %zmm2, %zmm4
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm6
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT: vpsrlvd %zmm6, %zmm7, %zmm6
-; AVX512F-NEXT: vpmovdw %zmm6, %ymm6
-; AVX512F-NEXT: vpsubw %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlvd %zmm5, %zmm6, %zmm5
+; AVX512F-NEXT: vpmovdw %zmm5, %ymm5
+; AVX512F-NEXT: vextracti64x4 $1, %zmm4, %ymm4
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm1
-; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm5, %ymm4
-; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
-; AVX512F-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1
+; AVX512F-NEXT: vpandq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpsllvd %zmm3, %zmm4, %zmm3
+; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512VL-NEXT: vpsllvd %zmm4, %zmm3, %zmm3
-; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm5
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
-; AVX512VL-NEXT: vpsllvd %zmm6, %zmm4, %zmm4
-; AVX512VL-NEXT: vpmovdw %zmm4, %ymm4
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %ymm2, %ymm4, %ymm6
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpandnq %zmm3, %zmm2, %zmm4
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm6
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512VL-NEXT: vpsrlvd %zmm6, %zmm7, %zmm6
-; AVX512VL-NEXT: vpmovdw %zmm6, %ymm6
-; AVX512VL-NEXT: vpsubw %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlvd %zmm5, %zmm6, %zmm5
+; AVX512VL-NEXT: vpmovdw %zmm5, %ymm5
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm4, %ymm4
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm1
-; AVX512VL-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpcmpeqw %ymm3, %ymm5, %ymm4
-; AVX512VL-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
-; AVX512VL-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1
+; AVX512VL-NEXT: vpandq %zmm3, %zmm2, %zmm2
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpsllvd %zmm3, %zmm4, %zmm3
+; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
-; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %zmm2, %zmm4, %zmm4
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandnq %zmm3, %zmm2, %zmm4
+; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v32i16:
@@ -238,15 +214,13 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %
;
; AVX512VLBW-LABEL: var_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
-; AVX512VLBW-NEXT: vpsllvw %zmm2, %zmm0, %zmm3
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %zmm2, %zmm4, %zmm4
+; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpandnq %zmm3, %zmm2, %zmm4
+; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512VLBW-NEXT: vptestnmw %zmm2, %zmm2, %k1
-; AVX512VLBW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v32i16:
@@ -260,298 +234,296 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %
define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-NEXT: vpsrlw $4, %ymm4, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm6
-; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %ymm3, %ymm7, %ymm8
-; AVX512F-NEXT: vpsllw $5, %ymm8, %ymm8
-; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm6
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX512F-NEXT: vpand %ymm6, %ymm9, %ymm6
-; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm8
-; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm6
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpand %ymm6, %ymm10, %ymm6
-; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm8
-; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm6
-; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm5
-; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm6
-; AVX512F-NEXT: vpsllw $5, %ymm6, %ymm6
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpandnq %zmm7, %zmm2, %zmm8
+; AVX512F-NEXT: vextracti64x4 $1, %zmm8, %ymm9
+; AVX512F-NEXT: vpsllw $5, %ymm9, %ymm9
+; AVX512F-NEXT: vpblendvb %ymm9, %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm5, %ymm10, %ymm5
+; AVX512F-NEXT: vpaddb %ymm9, %ymm9, %ymm9
+; AVX512F-NEXT: vpblendvb %ymm9, %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm5
+; AVX512F-NEXT: vpand %ymm4, %ymm5, %ymm5
+; AVX512F-NEXT: vpaddb %ymm9, %ymm9, %ymm9
+; AVX512F-NEXT: vpblendvb %ymm9, %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm5
+; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT: vpsllw $5, %ymm8, %ymm6
; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm5
-; AVX512F-NEXT: vpand %ymm5, %ymm9, %ymm5
-; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm5
; AVX512F-NEXT: vpand %ymm5, %ymm10, %ymm5
; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
-; AVX512F-NEXT: vpsllw $4, %ymm4, %ymm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
-; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm7
-; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5
-; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm7
-; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm5
-; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm7
-; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm5
-; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
-; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm6
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm5
-; AVX512F-NEXT: vpsllw $2, %ymm5, %ymm7
-; AVX512F-NEXT: vpand %ymm7, %ymm8, %ymm7
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm5
+; AVX512F-NEXT: vpand %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpandq %zmm7, %zmm2, %zmm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm6
+; AVX512F-NEXT: vpsllw $5, %ymm6, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
-; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm7
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4
; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
-; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT: vporq %zmm1, %zmm4, %zmm1
-; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512F-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512F-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm0
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
+; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4
+; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
+; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4
+; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512VL-NEXT: vpsrlw $4, %ymm4, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm6
-; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %ymm3, %ymm7, %ymm8
-; AVX512VL-NEXT: vpsllw $5, %ymm8, %ymm8
-; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm6
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm6
-; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm8
-; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm6
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-NEXT: vpand %ymm6, %ymm10, %ymm6
-; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm8
-; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm6
-; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm5
-; AVX512VL-NEXT: vpsubb %ymm2, %ymm7, %ymm6
-; AVX512VL-NEXT: vpsllw $5, %ymm6, %ymm6
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-NEXT: vpsrlw $1, %ymm3, %ymm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm5
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpandnq %zmm7, %zmm2, %zmm8
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm8, %ymm9
+; AVX512VL-NEXT: vpsllw $5, %ymm9, %ymm9
+; AVX512VL-NEXT: vpblendvb %ymm9, %ymm5, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsrlw $2, %ymm3, %ymm5
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512VL-NEXT: vpand %ymm5, %ymm10, %ymm5
+; AVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9
+; AVX512VL-NEXT: vpblendvb %ymm9, %ymm5, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsrlw $1, %ymm3, %ymm5
+; AVX512VL-NEXT: vpand %ymm4, %ymm5, %ymm5
+; AVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9
+; AVX512VL-NEXT: vpblendvb %ymm9, %ymm5, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm5
+; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512VL-NEXT: vpsllw $5, %ymm8, %ymm6
; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm5
-; AVX512VL-NEXT: vpand %ymm5, %ymm9, %ymm5
-; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
-; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm5
; AVX512VL-NEXT: vpand %ymm5, %ymm10, %ymm5
; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
-; AVX512VL-NEXT: vpsllw $4, %ymm4, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
-; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm7
-; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
-; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512VL-NEXT: vpand %ymm5, %ymm8, %ymm5
-; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7
-; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
-; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm5
-; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7
-; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5
-; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
-; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm6
-; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm5
-; AVX512VL-NEXT: vpsllw $2, %ymm5, %ymm7
-; AVX512VL-NEXT: vpand %ymm7, %ymm8, %ymm7
+; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm5
+; AVX512VL-NEXT: vpand %ymm4, %ymm5, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpandq %zmm7, %zmm2, %zmm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm6
+; AVX512VL-NEXT: vpsllw $5, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
-; AVX512VL-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
-; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm7
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4
; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
-; AVX512VL-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-NEXT: vporq %zmm1, %zmm4, %zmm1
-; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512VL-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm0
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %zmm2, %zmm3, %zmm3
-; AVX512BW-NEXT: vpsllw $5, %zmm3, %zmm3
-; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm4
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpandnq %zmm3, %zmm2, %zmm4
+; AVX512BW-NEXT: vpsllw $5, %zmm4, %zmm4
; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vpmovb2m %zmm3, %k2
-; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm3
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k2}
-; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm3
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
-; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm3
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512BW-NEXT: vpandq %zmm5, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm6
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm6, %zmm6
+; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm1 {%k1}
+; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm6
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm6, %zmm6
; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
-; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
-; AVX512BW-NEXT: vpsllw $5, %zmm2, %zmm4
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
-; AVX512BW-NEXT: vpsllw $2, %zmm3, %zmm5
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm1 {%k1}
+; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm6
+; AVX512BW-NEXT: vpandq %zmm5, %zmm6, %zmm5
; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
-; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
-; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1}
+; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsllw $5, %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm3
+; AVX512BW-NEXT: vpmovb2m %zmm3, %k1
+; AVX512BW-NEXT: vpmovb2m %zmm2, %k2
+; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k2}
+; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm2
+; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
+; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
-; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %zmm2, %zmm3, %zmm3
-; AVX512VBMI2-NEXT: vpsllw $5, %zmm3, %zmm3
-; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm4
-; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VBMI2-NEXT: vpmovb2m %zmm3, %k2
-; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm3
-; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k2}
-; AVX512VBMI2-NEXT: vpsrlw $2, %zmm1, %zmm3
-; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
-; AVX512VBMI2-NEXT: vpsrlw $1, %zmm1, %zmm3
-; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
-; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
-; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
-; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
-; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
-; AVX512VBMI2-NEXT: vpsllw $5, %zmm2, %zmm4
+; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpandnq %zmm3, %zmm2, %zmm4
+; AVX512VBMI2-NEXT: vpsllw $5, %zmm4, %zmm4
; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
-; AVX512VBMI2-NEXT: vpsllw $2, %zmm3, %zmm5
-; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512VBMI2-NEXT: vpsrlw $1, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VBMI2-NEXT: vpandq %zmm5, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm6
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm6, %zmm6
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm6, %zmm1 {%k1}
+; AVX512VBMI2-NEXT: vpsrlw $2, %zmm1, %zmm6
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm6, %zmm6
; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm6, %zmm1 {%k1}
+; AVX512VBMI2-NEXT: vpsrlw $1, %zmm1, %zmm6
+; AVX512VBMI2-NEXT: vpandq %zmm5, %zmm6, %zmm5
; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
-; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
-; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1}
+; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vpsllw $5, %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm3
+; AVX512VBMI2-NEXT: vpmovb2m %zmm3, %k1
+; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k2
+; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm2
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm2, %zmm0 {%k2}
+; AVX512VBMI2-NEXT: vpsllw $2, %zmm0, %zmm2
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm2
+; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
+; AVX512VBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
+; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %zmm2, %zmm3, %zmm3
-; AVX512VLBW-NEXT: vpsllw $5, %zmm3, %zmm3
-; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm4
+; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT: vpandnq %zmm3, %zmm2, %zmm4
+; AVX512VLBW-NEXT: vpsllw $5, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VLBW-NEXT: vpmovb2m %zmm3, %k2
-; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm3
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
-; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k2}
-; AVX512VLBW-NEXT: vpsrlw $2, %zmm1, %zmm3
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
-; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
-; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm3
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VLBW-NEXT: vpandq %zmm5, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm6
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm6, %zmm6
+; AVX512VLBW-NEXT: vmovdqu8 %zmm6, %zmm1 {%k1}
+; AVX512VLBW-NEXT: vpsrlw $2, %zmm1, %zmm6
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm6, %zmm6
; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
-; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
-; AVX512VLBW-NEXT: vpsllw $5, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
-; AVX512VLBW-NEXT: vpsllw $2, %zmm3, %zmm5
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
-; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VLBW-NEXT: vmovdqu8 %zmm6, %zmm1 {%k1}
+; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm6
+; AVX512VLBW-NEXT: vpandq %zmm5, %zmm6, %zmm5
; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
-; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1}
+; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpsllw $5, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm3
+; AVX512VLBW-NEXT: vpmovb2m %zmm3, %k1
+; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k2
+; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k2}
+; AVX512VLBW-NEXT: vpsllw $2, %zmm0, %zmm2
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm2
+; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
+; AVX512VLBW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
-; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %zmm2, %zmm3, %zmm3
-; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm3, %zmm3
-; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm4
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm3, %k2
-; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm3
-; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k2}
-; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm1, %zmm3
-; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
-; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm1, %zmm3
-; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
-; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpandnq %zmm3, %zmm2, %zmm4
+; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm4, %zmm4
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
-; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
-; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
-; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm2, %zmm4
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VLVBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
-; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm3, %zmm5
-; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VLVBMI2-NEXT: vpandq %zmm5, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm6
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm6, %zmm6
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm6, %zmm1 {%k1}
+; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm1, %zmm6
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm6, %zmm6
; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm6, %zmm1 {%k1}
+; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm1, %zmm6
+; AVX512VLVBMI2-NEXT: vpandq %zmm5, %zmm6, %zmm5
; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
-; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
-; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1}
+; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm3
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm3, %k1
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k2
+; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm2
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm2, %zmm0 {%k2}
+; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm0, %zmm2
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm2
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
+; AVX512VLVBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
+; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: retq
%res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
ret <64 x i8> %res
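
(For reference, not part of the patch: the updated CHECK lines above follow the select-free fshl expansion, where x is shifted left by (amt & (BW-1)) and ORed with y pre-shifted right by one and then right by (~amt & (BW-1)). A minimal scalar C sketch of that computation for one 64-bit lane; the fshl64 helper name is illustrative only.)

/* Scalar model of the fshl expansion reflected by the vector CHECK lines.
 * The amount is reduced modulo the bit width, and y is pre-shifted right by 1
 * so the second right shift (~amt & 63) never needs an undefined full-width
 * shift when amt is a multiple of 64. */
#include <stdint.h>
#include <stdio.h>

static uint64_t fshl64(uint64_t x, uint64_t y, uint64_t amt) {
  uint64_t sh = amt & 63;                /* effective left-shift amount       */
  uint64_t hi = x << sh;                 /* x shifted into the high positions */
  uint64_t lo = (y >> 1) >> (~amt & 63); /* top bits of y, no zero-amt select */
  return hi | lo;
}

int main(void) {
  /* fshl(x, y, 8): x shifted left by 8, ORed with the top 8 bits of y. */
  printf("%016llx\n",
         (unsigned long long)fshl64(0x1122334455667788ULL,
                                    0x99aabbccddeeff00ULL, 8));
  return 0;
}

(The splatvar_funnnel_v8i64 output below, vpsllq / vpsrlq $1 / vpandnq / vpsrlvq / vporq, is the vector form of the same computation.)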
@@ -564,46 +536,35 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v8i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpsllq %xmm2, %zmm0, %zmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
-; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512F-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512F-NEXT: vpbroadcastq %xmm2, %zmm3
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT: vpsllq %xmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpsrlq $1, %zmm1, %zmm1
+; AVX512F-NEXT: vpandnq {{.*}}(%rip){1to8}, %zmm3, %zmm2
+; AVX512F-NEXT: vpsrlvq %zmm2, %zmm1, %zmm1
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v8i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to2}, %xmm2, %xmm2
-; AVX512VL-NEXT: vpsllq %xmm2, %zmm0, %zmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512VL-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
-; AVX512VL-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512VL-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512VL-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512VL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512VL-NEXT: vpbroadcastq %xmm2, %zmm3
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT: vpsllq %xmm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vpsrlq $1, %zmm1, %zmm1
+; AVX512VL-NEXT: vpandnq {{.*}}(%rip){1to8}, %zmm3, %zmm2
+; AVX512VL-NEXT: vpsrlvq %zmm2, %zmm1, %zmm1
+; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsllq %xmm2, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512BW-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT: vpbroadcastq %xmm2, %zmm3
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpsllq %xmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlq $1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpandnq {{.*}}(%rip){1to8}, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlvq %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v8i64:
@@ -614,16 +575,13 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %
;
; AVX512VLBW-LABEL: splatvar_funnnel_v8i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to2}, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpsllq %xmm2, %zmm0, %zmm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512VLBW-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512VLBW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %zmm3
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsllq %xmm2, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpsrlq $1, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpandnq {{.*}}(%rip){1to8}, %zmm3, %zmm2
+; AVX512VLBW-NEXT: vpsrlvq %zmm2, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i64:
@@ -639,52 +597,41 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %
define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v16i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX512F-NEXT: vpslld %xmm3, %zmm0, %zmm3
; AVX512F-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512F-NEXT: vpsrld %xmm4, %zmm1, %zmm1
-; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1
-; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512F-NEXT: vpandnd %zmm3, %zmm2, %zmm4
+; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1
+; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
+; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512F-NEXT: vpslld %xmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v16i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX512VL-NEXT: vpslld %xmm3, %zmm0, %zmm3
; AVX512VL-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512VL-NEXT: vpsrld %xmm4, %zmm1, %zmm1
-; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1
-; AVX512VL-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512VL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512VL-NEXT: vpandnd %zmm3, %zmm2, %zmm4
+; AVX512VL-NEXT: vpsrld $1, %zmm1, %zmm1
+; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
+; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512VL-NEXT: vpslld %xmm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX512BW-NEXT: vpslld %xmm3, %zmm0, %zmm3
; AVX512BW-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512BW-NEXT: vpsrld %xmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vpord %zmm1, %zmm3, %zmm1
-; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512BW-NEXT: vpandnd %zmm3, %zmm2, %zmm4
+; AVX512BW-NEXT: vpsrld $1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
+; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512BW-NEXT: vpslld %xmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v16i32:
@@ -695,18 +642,15 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
;
; AVX512VLBW-LABEL: splatvar_funnnel_v16i32:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX512VLBW-NEXT: vpslld %xmm3, %zmm0, %zmm3
; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512VLBW-NEXT: vpsrld %xmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vpord %zmm1, %zmm3, %zmm1
-; AVX512VLBW-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512VLBW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512VLBW-NEXT: vpandnd %zmm3, %zmm2, %zmm4
+; AVX512VLBW-NEXT: vpsrld $1, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512VLBW-NEXT: vpslld %xmm2, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i32:
@@ -722,64 +666,68 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpsllw %xmm4, %ymm0, %ymm4
-; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm5
-; AVX512F-NEXT: vpsrlw %xmm4, %ymm5, %ymm5
-; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1
-; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
-; AVX512F-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm0
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm3
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512F-NEXT: vpsllw %xmm3, %ymm4, %ymm4
+; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512F-NEXT: vpandnq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm4
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512F-NEXT: vpsrlvd %zmm3, %zmm4, %zmm3
+; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm4, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsllw %xmm4, %ymm0, %ymm4
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm5
-; AVX512VL-NEXT: vpsrlw %xmm4, %ymm5, %ymm5
-; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1
-; AVX512VL-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
-; AVX512VL-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm3
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512VL-NEXT: vpsllw %xmm3, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512VL-NEXT: vpandnq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm4
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm4, %zmm3
+; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1
+; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm3
; AVX512BW-NEXT: vpbroadcastw %xmm2, %zmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpandnq {{.*}}(%rip), %zmm2, %zmm3
+; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v32i16:
@@ -790,18 +738,14 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
;
; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm3
; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %zmm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512VLBW-NEXT: vptestnmw %zmm2, %zmm2, %k1
-; AVX512VLBW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpandnq {{.*}}(%rip), %zmm2, %zmm3
+; AVX512VLBW-NEXT: vpsrlvw %zmm3, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i16:
@@ -817,158 +761,230 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpsllw %xmm4, %ymm0, %ymm5
-; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3
-; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpsllw %xmm4, %xmm5, %xmm4
-; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4
-; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4
-; AVX512F-NEXT: vpandq %zmm4, %zmm3, %zmm3
; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm6
-; AVX512F-NEXT: vpsrlw %xmm4, %ymm6, %ymm6
-; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm1
-; AVX512F-NEXT: vpsrlw %xmm4, %xmm5, %xmm4
-; AVX512F-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4
-; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4
-; AVX512F-NEXT: vpternlogq $236, %zmm1, %zmm3, %zmm4
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpcmpeqb %ymm1, %ymm2, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
-; AVX512F-NEXT: vpternlogq $226, %zmm4, %zmm1, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT: vpandnq {{.*}}(%rip), %zmm2, %zmm7
+; AVX512F-NEXT: vextracti64x4 $1, %zmm7, %ymm8
+; AVX512F-NEXT: vpsllw $5, %ymm8, %ymm8
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm5, %ymm9, %ymm5
+; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm8
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm5
+; AVX512F-NEXT: vpand %ymm4, %ymm5, %ymm5
+; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm8
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm5
+; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT: vpsllw $5, %ymm7, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm5
+; AVX512F-NEXT: vpand %ymm5, %ymm9, %ymm5
+; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm5
+; AVX512F-NEXT: vpand %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm3
+; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3
+; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vpsllw %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512F-NEXT: vpternlogq $236, %zmm3, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm4, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsllw %xmm4, %ymm0, %ymm5
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3
-; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT: vpsllw %xmm4, %xmm5, %xmm4
-; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4
-; AVX512VL-NEXT: vpandq %zmm4, %zmm3, %zmm3
; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm6
-; AVX512VL-NEXT: vpsrlw %xmm4, %ymm6, %ymm6
-; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm1
-; AVX512VL-NEXT: vpsrlw %xmm4, %xmm5, %xmm4
-; AVX512VL-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4
-; AVX512VL-NEXT: vpternlogq $236, %zmm1, %zmm3, %zmm4
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpcmpeqb %ymm1, %ymm2, %ymm1
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
-; AVX512VL-NEXT: vpternlogq $226, %zmm4, %zmm1, %zmm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-NEXT: vpsrlw $1, %ymm3, %ymm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm5
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512VL-NEXT: vpandnq {{.*}}(%rip), %zmm2, %zmm7
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm7, %ymm8
+; AVX512VL-NEXT: vpsllw $5, %ymm8, %ymm8
+; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsrlw $2, %ymm3, %ymm5
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512VL-NEXT: vpand %ymm5, %ymm9, %ymm5
+; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm8
+; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsrlw $1, %ymm3, %ymm5
+; AVX512VL-NEXT: vpand %ymm4, %ymm5, %ymm5
+; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm8
+; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm5
+; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512VL-NEXT: vpsllw $5, %ymm7, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm5
+; AVX512VL-NEXT: vpand %ymm5, %ymm9, %ymm5
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm5
+; AVX512VL-NEXT: vpand %ymm4, %ymm5, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3
+; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsllw %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512VL-NEXT: vpternlogq $236, %zmm3, %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm4
-; AVX512BW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512BW-NEXT: vpsllw %xmm3, %xmm5, %xmm3
-; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3
-; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3
; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm6
-; AVX512BW-NEXT: vpsrlw %xmm4, %xmm5, %xmm1
-; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
-; AVX512BW-NEXT: vpternlogq $236, %zmm6, %zmm3, %zmm1
-; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512BW-NEXT: vpandq %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm4
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
+; AVX512BW-NEXT: vpandnq {{.*}}(%rip), %zmm2, %zmm5
+; AVX512BW-NEXT: vpsllw $5, %zmm5, %zmm5
+; AVX512BW-NEXT: vpmovb2m %zmm5, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm1 {%k1}
+; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm4
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
+; AVX512BW-NEXT: vpaddb %zmm5, %zmm5, %zmm5
+; AVX512BW-NEXT: vpmovb2m %zmm5, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm1 {%k1}
+; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm4
+; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT: vpaddb %zmm5, %zmm5, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm3
+; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsllw %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0
+; AVX512BW-NEXT: vpternlogq $236, %zmm3, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm4
-; AVX512VBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512VBMI2-NEXT: vpsllw %xmm3, %xmm5, %xmm3
-; AVX512VBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
-; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm6
-; AVX512VBMI2-NEXT: vpsrlw %xmm4, %xmm5, %xmm1
-; AVX512VBMI2-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512VBMI2-NEXT: vpbroadcastb %xmm1, %zmm1
-; AVX512VBMI2-NEXT: vpternlogq $236, %zmm6, %zmm3, %zmm1
-; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
-; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512VBMI2-NEXT: vpsrlw $1, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm4
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
+; AVX512VBMI2-NEXT: vpandnq {{.*}}(%rip), %zmm2, %zmm5
+; AVX512VBMI2-NEXT: vpsllw $5, %zmm5, %zmm5
+; AVX512VBMI2-NEXT: vpmovb2m %zmm5, %k1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm4, %zmm1 {%k1}
+; AVX512VBMI2-NEXT: vpsrlw $2, %zmm1, %zmm4
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
+; AVX512VBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm5
+; AVX512VBMI2-NEXT: vpmovb2m %zmm5, %k1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm4, %zmm1 {%k1}
+; AVX512VBMI2-NEXT: vpsrlw $1, %zmm1, %zmm4
+; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512VBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm4
+; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsllw %xmm2, %zmm0, %zmm3
+; AVX512VBMI2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpsllw %xmm2, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm0, %zmm0
+; AVX512VBMI2-NEXT: vpternlogq $236, %zmm3, %zmm1, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm4
-; AVX512VLBW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512VLBW-NEXT: vpsllw %xmm3, %xmm5, %xmm3
-; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3
; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm6
-; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm5, %xmm1
-; AVX512VLBW-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm1
-; AVX512VLBW-NEXT: vpternlogq $236, %zmm6, %zmm3, %zmm1
-; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VLBW-NEXT: vpandq %zmm3, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
+; AVX512VLBW-NEXT: vpandnq {{.*}}(%rip), %zmm2, %zmm5
+; AVX512VLBW-NEXT: vpsllw $5, %zmm5, %zmm5
+; AVX512VLBW-NEXT: vpmovb2m %zmm5, %k1
+; AVX512VLBW-NEXT: vmovdqu8 %zmm4, %zmm1 {%k1}
+; AVX512VLBW-NEXT: vpsrlw $2, %zmm1, %zmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
+; AVX512VLBW-NEXT: vpaddb %zmm5, %zmm5, %zmm5
+; AVX512VLBW-NEXT: vpmovb2m %zmm5, %k1
+; AVX512VLBW-NEXT: vmovdqu8 %zmm4, %zmm1 {%k1}
+; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm4
+; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512VLBW-NEXT: vpaddb %zmm5, %zmm5, %zmm4
+; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm3
+; AVX512VLBW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %zmm0
+; AVX512VLBW-NEXT: vpternlogq $236, %zmm3, %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm4
-; AVX512VLVBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %xmm5, %xmm3
-; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
-; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm6
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %xmm5, %xmm1
-; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm1, %zmm1
-; AVX512VLVBMI2-NEXT: vpternlogq $236, %zmm6, %zmm3, %zmm1
-; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
-; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm4
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT: vpandnq {{.*}}(%rip), %zmm2, %zmm5
+; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm5, %zmm5
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm5, %k1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm4, %zmm1 {%k1}
+; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm1, %zmm4
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm5
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm5, %k1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm4, %zmm1 {%k1}
+; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm1, %zmm4
+; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512VLVBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm4
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %zmm0, %zmm3
+; AVX512VLVBMI2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpternlogq $236, %zmm3, %zmm1, %zmm0
; AVX512VLVBMI2-NEXT: retq
%splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
%res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %splat)
@@ -1067,46 +1083,51 @@ define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwin
; AVX512F-LABEL: constant_funnnel_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <u,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768>
-; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpmulhuw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768,u>
+; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm4
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7],ymm4[8,9,10,11,12,13,14],ymm2[15]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpmulhuw %ymm3, %ymm1, %ymm3
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7],ymm3[8,9,10,11,12,13,14],ymm1[15]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpmullw %ymm3, %ymm0, %ymm3
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpternlogq $216, %zmm2, %zmm1, %zmm0
+; AVX512F-NEXT: vpmullw %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = <u,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768>
-; AVX512VL-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpmulhuw %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = <2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768,u>
+; AVX512VL-NEXT: vpmulhuw %ymm3, %ymm2, %ymm4
+; AVX512VL-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7],ymm4[8,9,10,11,12,13,14],ymm2[15]
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpmulhuw %ymm3, %ymm1, %ymm3
+; AVX512VL-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7],ymm3[8,9,10,11,12,13,14],ymm1[15]
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512VL-NEXT: vpmullw %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpmullw %ymm3, %ymm0, %ymm3
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512VL-NEXT: vporq %zmm1, %zmm2, %zmm1
-; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512VL-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512VL-NEXT: vpternlogq $216, %zmm2, %zmm1, %zmm0
+; AVX512VL-NEXT: vpmullw %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v32i16:
; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm2
-; AVX512BW-NEXT: vporq %zmm1, %zmm2, %zmm1
-; AVX512BW-NEXT: movl $65537, %eax # imm = 0x10001
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: constant_funnnel_v32i16:
@@ -1116,13 +1137,10 @@ define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwin
;
; AVX512VLBW-LABEL: constant_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1
-; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm2
-; AVX512VLBW-NEXT: vporq %zmm1, %zmm2, %zmm1
-; AVX512VLBW-NEXT: movl $65537, %eax # imm = 0x10001
-; AVX512VLBW-NEXT: kmovd %eax, %k1
-; AVX512VLBW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v32i16:
@@ -1153,36 +1171,40 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm5, %ymm3, %ymm0, %ymm3
-; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm4
-; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4
-; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm3
+; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31]
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,256,128,64,32,16,8,4,2,256,128,64,32,16,8,4]
; AVX512F-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23]
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256]
; AVX512F-NEXT: # ymm7 = mem[0,1,0,1]
-; AVX512F-NEXT: vpmullw %ymm7, %ymm3, %ymm3
+; AVX512F-NEXT: vpmullw %ymm7, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT: vpackuswb %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15],ymm1[24],ymm4[24],ymm1[25],ymm4[25],ymm1[26],ymm4[26],ymm1[27],ymm4[27],ymm1[28],ymm4[28],ymm1[29],ymm4[29],ymm1[30],ymm4[30],ymm1[31],ymm4[31]
+; AVX512F-NEXT: vpmullw %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15],ymm1[24],ymm4[24],ymm1[25],ymm4[25],ymm1[26],ymm4[26],ymm1[27],ymm4[27],ymm1[28],ymm4[28],ymm1[29],ymm4[29],ymm1[30],ymm4[30],ymm1[31],ymm4[31]
-; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5
-; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[16],ymm4[16],ymm1[17],ymm4[17],ymm1[18],ymm4[18],ymm1[19],ymm4[19],ymm1[20],ymm4[20],ymm1[21],ymm4[21],ymm1[22],ymm4[22],ymm1[23],ymm4[23]
; AVX512F-NEXT: vpmullw %ymm7, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1
-; AVX512F-NEXT: vpternlogq $216, {{.*}}(%rip){1to8}, %zmm1, %zmm0
+; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v64i8:
@@ -1204,36 +1226,40 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3
; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm0, %ymm3
-; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm4
-; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4
-; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm3
+; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512VL-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31]
-; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,256,128,64,32,16,8,4,2,256,128,64,32,16,8,4]
; AVX512VL-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512VL-NEXT: vpmullw %ymm6, %ymm5, %ymm5
; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23]
-; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256]
; AVX512VL-NEXT: # ymm7 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpmullw %ymm7, %ymm3, %ymm3
+; AVX512VL-NEXT: vpmullw %ymm7, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512VL-NEXT: vpackuswb %ymm5, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15],ymm1[24],ymm4[24],ymm1[25],ymm4[25],ymm1[26],ymm4[26],ymm1[27],ymm4[27],ymm1[28],ymm4[28],ymm1[29],ymm4[29],ymm1[30],ymm4[30],ymm1[31],ymm4[31]
+; AVX512VL-NEXT: vpmullw %ymm6, %ymm3, %ymm3
; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512VL-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15],ymm1[24],ymm4[24],ymm1[25],ymm4[25],ymm1[26],ymm4[26],ymm1[27],ymm4[27],ymm1[28],ymm4[28],ymm1[29],ymm4[29],ymm1[30],ymm4[30],ymm1[31],ymm4[31]
-; AVX512VL-NEXT: vpmullw %ymm6, %ymm5, %ymm5
-; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[16],ymm4[16],ymm1[17],ymm4[17],ymm1[18],ymm4[18],ymm1[19],ymm4[19],ymm1[20],ymm4[20],ymm1[21],ymm4[21],ymm1[22],ymm4[22],ymm1[23],ymm4[23]
; AVX512VL-NEXT: vpmullw %ymm7, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512VL-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512VL-NEXT: vporq %zmm1, %zmm2, %zmm1
-; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip){1to8}, %zmm1, %zmm0
+; AVX512VL-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v64i8:
@@ -1243,28 +1269,26 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
-; AVX512BW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
-; AVX512BW-NEXT: vpsllw $2, %zmm3, %zmm4
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm3
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
-; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
-; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm4, %zmm4
-; AVX512BW-NEXT: vpsrlw $8, %zmm4, %zmm4
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
+; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
-; AVX512BW-NEXT: vpackuswb %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512BW-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
-; AVX512BW-NEXT: kmovq %rax, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT: vpackuswb %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: constant_funnnel_v64i8:
@@ -1274,28 +1298,26 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
-; AVX512VBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
-; AVX512VBMI2-NEXT: vpsllw $2, %zmm3, %zmm4
-; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512VBMI2-NEXT: vpsllw $2, %zmm0, %zmm3
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2
; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2
; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
-; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512VBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
+; AVX512VBMI2-NEXT: vpsrlw $1, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
-; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm4, %zmm4
-; AVX512VBMI2-NEXT: vpsrlw $8, %zmm4, %zmm4
+; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
+; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
; AVX512VBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1
-; AVX512VBMI2-NEXT: vpackuswb %zmm4, %zmm1, %zmm1
-; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512VBMI2-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
-; AVX512VBMI2-NEXT: kmovq %rax, %k1
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
-; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512VBMI2-NEXT: vpackuswb %zmm3, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: constant_funnnel_v64i8:
@@ -1305,28 +1327,26 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
-; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
-; AVX512VLBW-NEXT: vpsllw $2, %zmm3, %zmm4
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
+; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512VLBW-NEXT: vpsllw $2, %zmm0, %zmm3
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
+; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
-; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512VLBW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
+; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
-; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm4, %zmm4
-; AVX512VLBW-NEXT: vpsrlw $8, %zmm4, %zmm4
+; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
+; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLBW-NEXT: vpsrlw $8, %zmm3, %zmm3
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vpackuswb %zmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512VLBW-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
-; AVX512VLBW-NEXT: kmovq %rax, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
-; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512VLBW-NEXT: vpackuswb %zmm3, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v64i8:
@@ -1336,28 +1356,26 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
-; AVX512VLVBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
-; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm3, %zmm4
-; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm0, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
-; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
+; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLVBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
-; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm4, %zmm4
-; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1
-; AVX512VLVBMI2-NEXT: vpackuswb %zmm4, %zmm1, %zmm1
-; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512VLVBMI2-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
-; AVX512VLVBMI2-NEXT: kmovq %rax, %k1
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
-; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512VLVBMI2-NEXT: vpackuswb %zmm3, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: retq
%res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
ret <64 x i8> %res
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
index a8a03f5c28cc..231b314fa9f4 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
@@ -1256,16 +1256,24 @@ define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x) nounwind {
;
; X32-SSE-LABEL: constant_funnnel_v2i64:
; X32-SSE: # %bb.0:
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [63,0,63,0]
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = <4,u,14,u>
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: psubq %xmm2, %xmm3
+; X32-SSE-NEXT: pand %xmm1, %xmm2
+; X32-SSE-NEXT: movdqa %xmm0, %xmm4
+; X32-SSE-NEXT: psllq %xmm2, %xmm4
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm5
+; X32-SSE-NEXT: psllq %xmm2, %xmm5
+; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
+; X32-SSE-NEXT: pand %xmm1, %xmm3
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrlq $60, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrlq $50, %xmm2
-; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psllq $4, %xmm1
-; X32-SSE-NEXT: psllq $14, %xmm0
+; X32-SSE-NEXT: psrlq %xmm3, %xmm1
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
+; X32-SSE-NEXT: psrlq %xmm2, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X32-SSE-NEXT: orpd %xmm2, %xmm0
+; X32-SSE-NEXT: orpd %xmm5, %xmm0
; X32-SSE-NEXT: retl
%res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> <i64 4, i64 14>)
ret <2 x i64> %res
@@ -1657,8 +1665,10 @@ define <2 x i64> @splatconstant_funnnel_v2i64(<2 x i64> %x) nounwind {
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlq $50, %xmm1
+; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm1[0,1]
; X32-SSE-NEXT: psllq $14, %xmm0
-; X32-SSE-NEXT: por %xmm1, %xmm0
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm0[0,1]
+; X32-SSE-NEXT: orpd %xmm1, %xmm0
; X32-SSE-NEXT: retl
%res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> <i64 14, i64 14>)
ret <2 x i64> %res
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index 43b8a49d1776..20033b4dbc9b 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -27,154 +27,125 @@ declare <16 x i8> @llvm.fshr.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: psrlq %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,63]
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pand %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: psrlq %xmm4, %xmm5
-; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [64,64]
-; SSE2-NEXT: psubq %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: psllq %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; SSE2-NEXT: psllq %xmm3, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; SSE2-NEXT: orpd %xmm5, %xmm0
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; SSE2-NEXT: psrlq %xmm4, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: psllq $1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psllq %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; SSE2-NEXT: psllq %xmm2, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; SSE2-NEXT: orpd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v2i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlq %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [63,63]
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pand %xmm3, %xmm4
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: psrlq %xmm4, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm0[0,1,2,3],xmm5[4,5,6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [64,64]
-; SSE41-NEXT: psubq %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psllq %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; SSE41-NEXT: psllq %xmm0, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: por %xmm5, %xmm3
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pcmpeqq %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; SSE41-NEXT: psrlq %xmm4, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pandn %xmm3, %xmm2
+; SSE41-NEXT: psllq $1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psllq %xmm2, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; SSE41-NEXT: psllq %xmm2, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_funnnel_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
-; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm5
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX2-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v2i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512F-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
+; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX512F-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v2i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512VL-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vptestnmq %xmm2, %xmm2, %k1
-; AVX512VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX512VL-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
+; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v2i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
-; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
+; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v2i64:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512VBMI2-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX512VBMI2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v2i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vptestnmq %xmm2, %xmm2, %k1
-; AVX512VLBW-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX512VLBW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v2i64:
@@ -185,56 +156,46 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
;
; XOPAVX1-LABEL: var_funnnel_v2i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpsllq $1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshlq %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm4
-; XOPAVX1-NEXT: vpshlq %xmm4, %xmm1, %xmm4
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [64,64]
-; XOPAVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm5
-; XOPAVX1-NEXT: vpshlq %xmm5, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpor %xmm4, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpshlq %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_funnnel_v2i64:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; XOPAVX2-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; XOPAVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; XOPAVX2-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOPAVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpsllq $1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: var_funnnel_v2i64:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT: movdqa %xmm1, %xmm3
-; X32-SSE-NEXT: psrlq %xmm2, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0]
+; X32-SSE-NEXT: movdqa %xmm2, %xmm4
+; X32-SSE-NEXT: pand %xmm3, %xmm4
; X32-SSE-NEXT: movdqa %xmm1, %xmm5
; X32-SSE-NEXT: psrlq %xmm4, %xmm5
-; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [64,0,64,0]
-; X32-SSE-NEXT: psubq %xmm2, %xmm3
-; X32-SSE-NEXT: movdqa %xmm0, %xmm4
-; X32-SSE-NEXT: psllq %xmm3, %xmm4
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; X32-SSE-NEXT: psllq %xmm3, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; X32-SSE-NEXT: orpd %xmm5, %xmm0
-; X32-SSE-NEXT: pxor %xmm3, %xmm3
-; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
-; X32-SSE-NEXT: pand %xmm3, %xmm2
-; X32-SSE-NEXT: pand %xmm2, %xmm1
-; X32-SSE-NEXT: pandn %xmm0, %xmm2
-; X32-SSE-NEXT: por %xmm1, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm0
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; X32-SSE-NEXT: psrlq %xmm4, %xmm1
+; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
+; X32-SSE-NEXT: pandn %xmm3, %xmm2
+; X32-SSE-NEXT: psllq $1, %xmm0
+; X32-SSE-NEXT: movdqa %xmm0, %xmm3
+; X32-SSE-NEXT: psllq %xmm2, %xmm3
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; X32-SSE-NEXT: psllq %xmm2, %xmm0
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; X32-SSE-NEXT: orpd %xmm1, %xmm0
; X32-SSE-NEXT: retl
%res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
ret <2 x i64> %res
@@ -243,186 +204,157 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: psrld %xmm3, %xmm4
-; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: psrld %xmm5, %xmm3
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: psrld %xmm5, %xmm6
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: psrld %xmm4, %xmm5
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[0,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32,32,32,32]
-; SSE2-NEXT: psubd %xmm2, %xmm4
-; SSE2-NEXT: pslld $23, %xmm4
-; SSE2-NEXT: paddd {{.*}}(%rip), %xmm4
-; SSE2-NEXT: cvttps2dq %xmm4, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm5, %xmm0
+; SSE2-NEXT: psrld %xmm3, %xmm6
+; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psrld %xmm7, %xmm3
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: psrld %xmm6, %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrld %xmm5, %xmm1
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
+; SSE2-NEXT: pandn %xmm4, %xmm2
+; SSE2-NEXT: pslld $23, %xmm2
+; SSE2-NEXT: paddd {{.*}}(%rip), %xmm2
+; SSE2-NEXT: cvttps2dq %xmm2, %xmm1
+; SSE2-NEXT: pslld $1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
-; SSE2-NEXT: por %xmm3, %xmm6
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm6, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v4i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: psrld %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [31,31,31,31]
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pand %xmm8, %xmm4
+; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm6
; SSE41-NEXT: psrld %xmm5, %xmm6
-; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4,5,6,7]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: psrld %xmm4, %xmm5
-; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: psrld %xmm0, %xmm4
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [32,32,32,32]
-; SSE41-NEXT: psubd %xmm2, %xmm0
-; SSE41-NEXT: pslld $23, %xmm0
-; SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
-; SSE41-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE41-NEXT: pmulld %xmm0, %xmm3
-; SSE41-NEXT: por %xmm4, %xmm3
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: movaps %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: psrld %xmm7, %xmm3
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: psrld %xmm4, %xmm6
+; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: psrld %xmm4, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT: pandn %xmm8, %xmm2
+; SSE41-NEXT: pslld $23, %xmm2
+; SSE41-NEXT: paddd {{.*}}(%rip), %xmm2
+; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
+; SSE41-NEXT: pslld $1, %xmm0
+; SSE41-NEXT: pmulld %xmm2, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_funnnel_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4
-; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31]
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero
+; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm6
; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3],xmm5[4,5],xmm3[6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
-; AVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm5
-; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
-; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm5, %xmm5
-; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
-; AVX1-NEXT: vpmulld %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
+; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
+; AVX1-NEXT: vpslld $1, %xmm0, %xmm0
+; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX2-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpslld $1, %xmm0, %xmm0
+; AVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v4i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512F-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX512F-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
+; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpslld $1, %xmm0, %xmm0
+; AVX512F-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v4i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
-; AVX512VL-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512VL-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vptestnmd %xmm2, %xmm2, %k1
-; AVX512VL-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX512VL-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
+; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpslld $1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v4i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
-; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX512BW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
+; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpslld $1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v4i32:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
-; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512VBMI2-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
-; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX512VBMI2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpslld $1, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v4i32:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vptestnmd %xmm2, %xmm2, %k1
-; AVX512VLBW-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX512VLBW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpslld $1, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v4i32:
@@ -433,69 +365,61 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
;
; XOPAVX1-LABEL: var_funnnel_v4i32:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31]
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpslld $1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm4
-; XOPAVX1-NEXT: vpshld %xmm4, %xmm1, %xmm4
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
-; XOPAVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm5
-; XOPAVX1-NEXT: vpshld %xmm5, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpor %xmm4, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpshld %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_funnnel_v4i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
-; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; XOPAVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; XOPAVX2-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOPAVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpslld $1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: var_funnnel_v4i32:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm1, %xmm4
-; X32-SSE-NEXT: psrld %xmm3, %xmm4
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm1, %xmm3
-; X32-SSE-NEXT: psrld %xmm5, %xmm3
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31]
+; X32-SSE-NEXT: movdqa %xmm2, %xmm5
+; X32-SSE-NEXT: pand %xmm4, %xmm5
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm1, %xmm6
-; X32-SSE-NEXT: psrld %xmm5, %xmm6
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm1, %xmm5
-; X32-SSE-NEXT: psrld %xmm4, %xmm5
-; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
-; X32-SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[0,3]
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [32,32,32,32]
-; X32-SSE-NEXT: psubd %xmm2, %xmm4
-; X32-SSE-NEXT: pslld $23, %xmm4
-; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm4
-; X32-SSE-NEXT: cvttps2dq %xmm4, %xmm4
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; X32-SSE-NEXT: pmuludq %xmm4, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; X32-SSE-NEXT: pmuludq %xmm5, %xmm0
+; X32-SSE-NEXT: psrld %xmm3, %xmm6
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm3
+; X32-SSE-NEXT: psrld %xmm7, %xmm3
+; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm7
+; X32-SSE-NEXT: psrld %xmm6, %xmm7
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT: psrld %xmm5, %xmm1
+; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
+; X32-SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
+; X32-SSE-NEXT: pandn %xmm4, %xmm2
+; X32-SSE-NEXT: pslld $23, %xmm2
+; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT: cvttps2dq %xmm2, %xmm1
+; X32-SSE-NEXT: pslld $1, %xmm0
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
-; X32-SSE-NEXT: por %xmm3, %xmm6
-; X32-SSE-NEXT: pxor %xmm0, %xmm0
-; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm0
-; X32-SSE-NEXT: pand %xmm0, %xmm1
-; X32-SSE-NEXT: pandn %xmm6, %xmm0
-; X32-SSE-NEXT: por %xmm1, %xmm0
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X32-SSE-NEXT: pmuludq %xmm2, %xmm1
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: retl
%res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
ret <4 x i32> %res
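
(Editorial aside, not part of the diff: the updated v4i32 checks above follow the select-free fshr expansion — mask the amount with bitwidth-1 for the right shift, mask the inverted amount for the left shift, and pre-shift the high operand left by one so the amount==0 case needs no compare/blend. Below is a minimal scalar C sketch of that identity; the helper names fshr32_expanded/fshr32_reference are illustrative only and do not appear in the commit.)

    #include <stdint.h>
    #include <stdio.h>
    #include <assert.h>

    /* Select-free expansion mirrored by the new checks:
       amt = z & 31 drives the right shift of y (the pand with [31,31,31,31]),
       inv = ~z & 31 drives the left shift of x after a pre-shift by 1
       (the pandn with the same mask plus pslld $1). */
    static uint32_t fshr32_expanded(uint32_t x, uint32_t y, uint32_t z) {
      uint32_t amt = z & 31;
      uint32_t inv = ~z & 31;
      return (y >> amt) | ((x << 1) << inv);
    }

    /* Reference semantics of fshr on i32: concatenate x:y and shift right
       by z mod 32; when the amount is 0 the result is just y. */
    static uint32_t fshr32_reference(uint32_t x, uint32_t y, uint32_t z) {
      uint32_t amt = z & 31;
      return amt ? (y >> amt) | (x << (32 - amt)) : y;
    }

    int main(void) {
      for (uint32_t z = 0; z < 64; ++z)
        assert(fshr32_expanded(0x12345678u, 0x9abcdef0u, z) ==
               fshr32_reference(0x12345678u, 0x9abcdef0u, z));
      puts("ok");
      return 0;
    }
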
@@ -504,254 +428,231 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; SSE2-NEXT: psubw %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpeqw %xmm2, %xmm3
-; SSE2-NEXT: psllw $12, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: psraw $15, %xmm5
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: psrlw $8, %xmm6
-; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: psllw $12, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psraw $15, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: pandn %xmm1, %xmm5
-; SSE2-NEXT: por %xmm6, %xmm5
-; SSE2-NEXT: paddw %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: psraw $15, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm7
-; SSE2-NEXT: pandn %xmm5, %xmm7
-; SSE2-NEXT: psrlw $4, %xmm5
-; SSE2-NEXT: pand %xmm6, %xmm5
-; SSE2-NEXT: por %xmm7, %xmm5
-; SSE2-NEXT: paddw %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: psraw $15, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm7
-; SSE2-NEXT: pandn %xmm5, %xmm7
-; SSE2-NEXT: psrlw $2, %xmm5
-; SSE2-NEXT: pand %xmm6, %xmm5
-; SSE2-NEXT: por %xmm7, %xmm5
-; SSE2-NEXT: paddw %xmm2, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: pandn %xmm5, %xmm6
-; SSE2-NEXT: psrlw $1, %xmm5
-; SSE2-NEXT: pand %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: paddw %xmm3, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psraw $15, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm1, %xmm5
+; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: paddw %xmm3, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psraw $15, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm1, %xmm5
+; SSE2-NEXT: psrlw $2, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: paddw %xmm3, %xmm3
+; SSE2-NEXT: psraw $15, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm1, %xmm4
+; SSE2-NEXT: psrlw $1, %xmm1
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pandn {{.*}}(%rip), %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $23, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT: paddd %xmm5, %xmm3
+; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $23, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [1065353216,1065353216,1065353216,1065353216]
-; SSE2-NEXT: paddd %xmm7, %xmm2
+; SSE2-NEXT: paddd %xmm5, %xmm2
; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: pslld $23, %xmm4
-; SSE2-NEXT: paddd %xmm7, %xmm4
-; SSE2-NEXT: cvttps2dq %xmm4, %xmm4
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
-; SSE2-NEXT: pmullw %xmm0, %xmm4
-; SSE2-NEXT: por %xmm6, %xmm4
-; SSE2-NEXT: por %xmm5, %xmm4
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm4, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT: psllw $1, %xmm0
+; SSE2-NEXT: pmullw %xmm2, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; SSE41-NEXT: psubw %xmm2, %xmm5
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: pcmpeqw %xmm2, %xmm4
-; SSE41-NEXT: psllw $12, %xmm2
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: psllw $12, %xmm4
; SSE41-NEXT: psllw $4, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: paddw %xmm0, %xmm2
+; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm4
; SSE41-NEXT: movdqa %xmm1, %xmm6
; SSE41-NEXT: psrlw $8, %xmm6
-; SSE41-NEXT: movdqa %xmm1, %xmm7
-; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm7
-; SSE41-NEXT: movdqa %xmm7, %xmm6
+; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm6
; SSE41-NEXT: psrlw $4, %xmm6
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm7
-; SSE41-NEXT: movdqa %xmm7, %xmm6
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm6
; SSE41-NEXT: psrlw $2, %xmm6
-; SSE41-NEXT: paddw %xmm2, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm7
-; SSE41-NEXT: movdqa %xmm7, %xmm6
+; SSE41-NEXT: paddw %xmm4, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm6
; SSE41-NEXT: psrlw $1, %xmm6
-; SSE41-NEXT: paddw %xmm2, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm7
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
-; SSE41-NEXT: pslld $23, %xmm5
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
-; SSE41-NEXT: paddd %xmm2, %xmm5
-; SSE41-NEXT: cvttps2dq %xmm5, %xmm5
+; SSE41-NEXT: paddw %xmm4, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
+; SSE41-NEXT: pandn %xmm5, %xmm2
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; SSE41-NEXT: pslld $23, %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; SSE41-NEXT: paddd %xmm4, %xmm2
+; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
; SSE41-NEXT: pslld $23, %xmm0
-; SSE41-NEXT: paddd %xmm2, %xmm0
+; SSE41-NEXT: paddd %xmm4, %xmm0
; SSE41-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE41-NEXT: packusdw %xmm5, %xmm0
+; SSE41-NEXT: packusdw %xmm2, %xmm0
+; SSE41-NEXT: psllw $1, %xmm3
; SSE41-NEXT: pmullw %xmm0, %xmm3
-; SSE41-NEXT: por %xmm7, %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: por %xmm1, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_funnnel_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
-; AVX1-NEXT: vpsllw $4, %xmm2, %xmm4
-; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm4
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm5
-; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm1, %xmm3
-; AVX1-NEXT: vpsrlw $4, %xmm3, %xmm5
-; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm5
-; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm5
-; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
-; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
-; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
-; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpsllw $12, %xmm4, %xmm5
+; AVX1-NEXT: vpsllw $4, %xmm4, %xmm4
+; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm5
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm6
+; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4
+; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm4
+; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm4
+; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
+; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX2-NEXT: vpsrlvd %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX2-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX2-NEXT: vpsllw $1, %xmm0, %xmm0
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v8i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX512F-NEXT: vpsrlvd %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %xmm2, %xmm4, %xmm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512F-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX512F-NEXT: vpsllw $1, %xmm0, %xmm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512F-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX512VL-NEXT: vpsrlvd %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %xmm2, %xmm4, %xmm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512VL-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX512VL-NEXT: vpsllw $1, %xmm0, %xmm0
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512VL-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
+; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v8i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm3
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
-; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpsrlvw %xmm2, %xmm1, %xmm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpsllvw %xmm4, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vptestnmw %xmm2, %xmm2, %k1
-; AVX512VLBW-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1}
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpsllvw %xmm2, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v8i16:
@@ -762,79 +663,72 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
;
; XOP-LABEL: var_funnnel_v8i16:
; XOP: # %bb.0:
-; XOP-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; XOP-NEXT: vpsllw $1, %xmm0, %xmm0
+; XOP-NEXT: vpshlw %xmm4, %xmm0, %xmm0
+; XOP-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOP-NEXT: vpsubw %xmm2, %xmm3, %xmm4
-; XOP-NEXT: vpshlw %xmm4, %xmm1, %xmm4
-; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; XOP-NEXT: vpsubw %xmm2, %xmm5, %xmm5
-; XOP-NEXT: vpshlw %xmm5, %xmm0, %xmm0
-; XOP-NEXT: vpor %xmm4, %xmm0, %xmm0
-; XOP-NEXT: vpcomeqw %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vpsubw %xmm2, %xmm3, %xmm2
+; XOP-NEXT: vpshlw %xmm2, %xmm1, %xmm1
+; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_funnnel_v8i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; X32-SSE-NEXT: psubw %xmm2, %xmm4
-; X32-SSE-NEXT: pxor %xmm3, %xmm3
-; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm3
-; X32-SSE-NEXT: psllw $12, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm5
-; X32-SSE-NEXT: psraw $15, %xmm5
-; X32-SSE-NEXT: movdqa %xmm1, %xmm6
-; X32-SSE-NEXT: psrlw $8, %xmm6
-; X32-SSE-NEXT: pand %xmm5, %xmm6
+; X32-SSE-NEXT: movdqa %xmm2, %xmm3
+; X32-SSE-NEXT: psllw $12, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: psraw $15, %xmm4
+; X32-SSE-NEXT: movdqa %xmm4, %xmm5
; X32-SSE-NEXT: pandn %xmm1, %xmm5
-; X32-SSE-NEXT: por %xmm6, %xmm5
-; X32-SSE-NEXT: paddw %xmm2, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm6
-; X32-SSE-NEXT: psraw $15, %xmm6
-; X32-SSE-NEXT: movdqa %xmm6, %xmm7
-; X32-SSE-NEXT: pandn %xmm5, %xmm7
-; X32-SSE-NEXT: psrlw $4, %xmm5
-; X32-SSE-NEXT: pand %xmm6, %xmm5
-; X32-SSE-NEXT: por %xmm7, %xmm5
-; X32-SSE-NEXT: paddw %xmm2, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm6
-; X32-SSE-NEXT: psraw $15, %xmm6
-; X32-SSE-NEXT: movdqa %xmm6, %xmm7
-; X32-SSE-NEXT: pandn %xmm5, %xmm7
-; X32-SSE-NEXT: psrlw $2, %xmm5
-; X32-SSE-NEXT: pand %xmm6, %xmm5
-; X32-SSE-NEXT: por %xmm7, %xmm5
-; X32-SSE-NEXT: paddw %xmm2, %xmm2
-; X32-SSE-NEXT: psraw $15, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm6
-; X32-SSE-NEXT: pandn %xmm5, %xmm6
-; X32-SSE-NEXT: psrlw $1, %xmm5
-; X32-SSE-NEXT: pand %xmm2, %xmm5
-; X32-SSE-NEXT: movdqa %xmm4, %xmm2
-; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: psrlw $8, %xmm1
+; X32-SSE-NEXT: pand %xmm4, %xmm1
+; X32-SSE-NEXT: por %xmm5, %xmm1
+; X32-SSE-NEXT: paddw %xmm3, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: psraw $15, %xmm4
+; X32-SSE-NEXT: movdqa %xmm4, %xmm5
+; X32-SSE-NEXT: pandn %xmm1, %xmm5
+; X32-SSE-NEXT: psrlw $4, %xmm1
+; X32-SSE-NEXT: pand %xmm4, %xmm1
+; X32-SSE-NEXT: por %xmm5, %xmm1
+; X32-SSE-NEXT: paddw %xmm3, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: psraw $15, %xmm4
+; X32-SSE-NEXT: movdqa %xmm4, %xmm5
+; X32-SSE-NEXT: pandn %xmm1, %xmm5
+; X32-SSE-NEXT: psrlw $2, %xmm1
+; X32-SSE-NEXT: pand %xmm4, %xmm1
+; X32-SSE-NEXT: por %xmm5, %xmm1
+; X32-SSE-NEXT: paddw %xmm3, %xmm3
+; X32-SSE-NEXT: psraw $15, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pandn %xmm1, %xmm4
+; X32-SSE-NEXT: psrlw $1, %xmm1
+; X32-SSE-NEXT: pand %xmm3, %xmm1
+; X32-SSE-NEXT: pandn {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm3
+; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pslld $23, %xmm3
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
+; X32-SSE-NEXT: paddd %xmm5, %xmm3
+; X32-SSE-NEXT: cvttps2dq %xmm3, %xmm3
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; X32-SSE-NEXT: pslld $23, %xmm2
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm7 = [1065353216,1065353216,1065353216,1065353216]
-; X32-SSE-NEXT: paddd %xmm7, %xmm2
+; X32-SSE-NEXT: paddd %xmm5, %xmm2
; X32-SSE-NEXT: cvttps2dq %xmm2, %xmm2
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
-; X32-SSE-NEXT: pslld $23, %xmm4
-; X32-SSE-NEXT: paddd %xmm7, %xmm4
-; X32-SSE-NEXT: cvttps2dq %xmm4, %xmm4
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
-; X32-SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
-; X32-SSE-NEXT: pmullw %xmm0, %xmm4
-; X32-SSE-NEXT: por %xmm6, %xmm4
-; X32-SSE-NEXT: por %xmm5, %xmm4
-; X32-SSE-NEXT: pand %xmm3, %xmm1
-; X32-SSE-NEXT: pandn %xmm4, %xmm3
-; X32-SSE-NEXT: por %xmm1, %xmm3
-; X32-SSE-NEXT: movdqa %xmm3, %xmm0
+; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; X32-SSE-NEXT: psllw $1, %xmm0
+; X32-SSE-NEXT: pmullw %xmm2, %xmm0
+; X32-SSE-NEXT: por %xmm4, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
%res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
ret <8 x i16> %res
@@ -843,347 +737,326 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v16i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: psllw $5, %xmm4
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: psrlw $4, %xmm4
-; SSE2-NEXT: pand %xmm6, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm6
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
-; SSE2-NEXT: por %xmm6, %xmm4
-; SSE2-NEXT: paddb %xmm5, %xmm5
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm7
-; SSE2-NEXT: pandn %xmm4, %xmm7
-; SSE2-NEXT: psrlw $2, %xmm4
-; SSE2-NEXT: pand %xmm6, %xmm4
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
-; SSE2-NEXT: por %xmm7, %xmm4
-; SSE2-NEXT: paddb %xmm5, %xmm5
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm5
-; SSE2-NEXT: pandn %xmm4, %xmm5
-; SSE2-NEXT: psrlw $1, %xmm4
-; SSE2-NEXT: pand %xmm6, %xmm4
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
-; SSE2-NEXT: por %xmm5, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; SSE2-NEXT: psubb %xmm2, %xmm5
-; SSE2-NEXT: psllw $5, %xmm5
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
+; SSE2-NEXT: pcmpgtb %xmm4, %xmm6
+; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm6, %xmm7
; SSE2-NEXT: pandn %xmm0, %xmm7
; SSE2-NEXT: psllw $4, %xmm0
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
-; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: paddb %xmm4, %xmm4
; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
+; SSE2-NEXT: pcmpgtb %xmm4, %xmm6
; SSE2-NEXT: movdqa %xmm6, %xmm7
; SSE2-NEXT: pandn %xmm0, %xmm7
; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
-; SSE2-NEXT: paddb %xmm5, %xmm5
-; SSE2-NEXT: pcmpeqb %xmm3, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm5, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pandn %xmm0, %xmm5
-; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: paddb %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: pcmpgtb %xmm4, %xmm6
+; SSE2-NEXT: movdqa %xmm6, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
; SSE2-NEXT: paddb %xmm0, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: psllw $5, %xmm2
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm1, %xmm6
+; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm1, %xmm6
+; SSE2-NEXT: psrlw $2, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: psrlw $1, %xmm1
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v16i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; SSE41-NEXT: psubb %xmm2, %xmm4
-; SSE41-NEXT: pxor %xmm5, %xmm5
-; SSE41-NEXT: pcmpeqb %xmm2, %xmm5
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: psllw $5, %xmm0
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $4, %xmm2
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: paddb %xmm0, %xmm4
; SSE41-NEXT: movdqa %xmm1, %xmm6
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm6
-; SSE41-NEXT: movdqa %xmm6, %xmm2
-; SSE41-NEXT: psrlw $2, %xmm2
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE41-NEXT: paddb %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm6
-; SSE41-NEXT: movdqa %xmm6, %xmm2
-; SSE41-NEXT: psrlw $1, %xmm2
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE41-NEXT: paddb %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm6
-; SSE41-NEXT: psllw $5, %xmm4
-; SSE41-NEXT: movdqa %xmm4, %xmm2
-; SSE41-NEXT: paddb %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm3, %xmm7
-; SSE41-NEXT: psllw $4, %xmm7
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm7
+; SSE41-NEXT: psrlw $4, %xmm6
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm6
+; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: psrlw $2, %xmm6
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm6
; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psllw $2, %xmm4
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm4
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: psrlw $1, %xmm6
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm6
+; SSE41-NEXT: paddb %xmm4, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
+; SSE41-NEXT: pandn %xmm5, %xmm3
+; SSE41-NEXT: psllw $5, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: paddb %xmm3, %xmm4
; SSE41-NEXT: paddb %xmm2, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
-; SSE41-NEXT: por %xmm6, %xmm3
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm5
+; SSE41-NEXT: psllw $4, %xmm5
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm5
; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm5, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psllw $2, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: paddb %xmm2, %xmm3
+; SSE41-NEXT: paddb %xmm4, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: por %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: var_funnnel_v16i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpsllw $5, %xmm2, %xmm3
-; AVX-NEXT: vpsrlw $4, %xmm1, %xmm4
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4
-; AVX-NEXT: vpblendvb %xmm3, %xmm4, %xmm1, %xmm4
-; AVX-NEXT: vpsrlw $2, %xmm4, %xmm5
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm5, %xmm5
-; AVX-NEXT: vpaddb %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
-; AVX-NEXT: vpsrlw $1, %xmm4, %xmm5
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm5, %xmm5
-; AVX-NEXT: vpaddb %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm3
-; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX-NEXT: vpsllw $5, %xmm4, %xmm4
; AVX-NEXT: vpaddb %xmm4, %xmm4, %xmm5
-; AVX-NEXT: vpsllw $4, %xmm0, %xmm6
+; AVX-NEXT: vpsrlw $4, %xmm1, %xmm6
; AVX-NEXT: vpand {{.*}}(%rip), %xmm6, %xmm6
-; AVX-NEXT: vpblendvb %xmm4, %xmm6, %xmm0, %xmm0
-; AVX-NEXT: vpsllw $2, %xmm0, %xmm4
+; AVX-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
+; AVX-NEXT: vpsrlw $2, %xmm1, %xmm4
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4
+; AVX-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; AVX-NEXT: vpsrlw $1, %xmm1, %xmm4
; AVX-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4
-; AVX-NEXT: vpblendvb %xmm5, %xmm4, %xmm0, %xmm0
-; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm4
; AVX-NEXT: vpaddb %xmm5, %xmm5, %xmm5
-; AVX-NEXT: vpblendvb %xmm5, %xmm4, %xmm0, %xmm0
-; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpsllw $5, %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm2, %xmm2, %xmm3
+; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpsllw $4, %xmm0, %xmm4
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
+; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vpaddb %xmm3, %xmm3, %xmm3
+; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v16i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
-; AVX512F-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
+; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512F-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
-; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
+; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512VL-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
-; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
+; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512BW-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v16i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VBMI2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
-; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v16i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm3, %ymm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLBW-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpsllvw %ymm2, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX512VLVBMI2-NEXT: vpsrlvw %ymm4, %ymm3, %ymm3
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLVBMI2-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1
+; AVX512VLVBMI2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLVBMI2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLVBMI2-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
-; AVX512VLVBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpsllvw %ymm2, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1
-; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VLVBMI2-NEXT: vzeroupper
; AVX512VLVBMI2-NEXT: retq
;
; XOP-LABEL: var_funnnel_v16i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; XOP-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; XOP-NEXT: vpshlb %xmm4, %xmm0, %xmm0
+; XOP-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOP-NEXT: vpsubb %xmm2, %xmm3, %xmm4
-; XOP-NEXT: vpshlb %xmm4, %xmm1, %xmm4
-; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; XOP-NEXT: vpsubb %xmm2, %xmm5, %xmm5
-; XOP-NEXT: vpshlb %xmm5, %xmm0, %xmm0
-; XOP-NEXT: vpor %xmm4, %xmm0, %xmm0
-; XOP-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vpsubb %xmm2, %xmm3, %xmm2
+; XOP-NEXT: vpshlb %xmm2, %xmm1, %xmm1
+; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_funnnel_v16i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm5
-; X32-SSE-NEXT: psllw $5, %xmm5
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; X32-SSE-NEXT: movdqa %xmm2, %xmm4
+; X32-SSE-NEXT: pandn %xmm5, %xmm4
+; X32-SSE-NEXT: psllw $5, %xmm4
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pxor %xmm6, %xmm6
-; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
-; X32-SSE-NEXT: movdqa %xmm1, %xmm4
-; X32-SSE-NEXT: psrlw $4, %xmm4
-; X32-SSE-NEXT: pand %xmm6, %xmm4
-; X32-SSE-NEXT: pandn %xmm1, %xmm6
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4
-; X32-SSE-NEXT: por %xmm6, %xmm4
-; X32-SSE-NEXT: paddb %xmm5, %xmm5
-; X32-SSE-NEXT: pxor %xmm6, %xmm6
-; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
-; X32-SSE-NEXT: movdqa %xmm6, %xmm7
-; X32-SSE-NEXT: pandn %xmm4, %xmm7
-; X32-SSE-NEXT: psrlw $2, %xmm4
-; X32-SSE-NEXT: pand %xmm6, %xmm4
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4
-; X32-SSE-NEXT: por %xmm7, %xmm4
-; X32-SSE-NEXT: paddb %xmm5, %xmm5
-; X32-SSE-NEXT: pxor %xmm6, %xmm6
-; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
-; X32-SSE-NEXT: movdqa %xmm6, %xmm5
-; X32-SSE-NEXT: pandn %xmm4, %xmm5
-; X32-SSE-NEXT: psrlw $1, %xmm4
-; X32-SSE-NEXT: pand %xmm6, %xmm4
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4
-; X32-SSE-NEXT: por %xmm5, %xmm4
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; X32-SSE-NEXT: psubb %xmm2, %xmm5
-; X32-SSE-NEXT: psllw $5, %xmm5
-; X32-SSE-NEXT: pxor %xmm6, %xmm6
-; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
+; X32-SSE-NEXT: pcmpgtb %xmm4, %xmm6
+; X32-SSE-NEXT: paddb %xmm0, %xmm0
; X32-SSE-NEXT: movdqa %xmm6, %xmm7
; X32-SSE-NEXT: pandn %xmm0, %xmm7
; X32-SSE-NEXT: psllw $4, %xmm0
; X32-SSE-NEXT: pand %xmm6, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: por %xmm7, %xmm0
-; X32-SSE-NEXT: paddb %xmm5, %xmm5
+; X32-SSE-NEXT: paddb %xmm4, %xmm4
; X32-SSE-NEXT: pxor %xmm6, %xmm6
-; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
+; X32-SSE-NEXT: pcmpgtb %xmm4, %xmm6
; X32-SSE-NEXT: movdqa %xmm6, %xmm7
; X32-SSE-NEXT: pandn %xmm0, %xmm7
; X32-SSE-NEXT: psllw $2, %xmm0
; X32-SSE-NEXT: pand %xmm6, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: por %xmm7, %xmm0
-; X32-SSE-NEXT: paddb %xmm5, %xmm5
-; X32-SSE-NEXT: pcmpeqb %xmm3, %xmm2
-; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm3
-; X32-SSE-NEXT: movdqa %xmm3, %xmm5
-; X32-SSE-NEXT: pandn %xmm0, %xmm5
-; X32-SSE-NEXT: por %xmm4, %xmm5
+; X32-SSE-NEXT: paddb %xmm4, %xmm4
+; X32-SSE-NEXT: pxor %xmm6, %xmm6
+; X32-SSE-NEXT: pcmpgtb %xmm4, %xmm6
+; X32-SSE-NEXT: movdqa %xmm6, %xmm4
+; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: paddb %xmm0, %xmm0
-; X32-SSE-NEXT: pand %xmm3, %xmm0
-; X32-SSE-NEXT: por %xmm5, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm1
-; X32-SSE-NEXT: pandn %xmm0, %xmm2
-; X32-SSE-NEXT: por %xmm1, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm0
+; X32-SSE-NEXT: pand %xmm6, %xmm0
+; X32-SSE-NEXT: pand %xmm5, %xmm2
+; X32-SSE-NEXT: psllw $5, %xmm2
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm5
+; X32-SSE-NEXT: movdqa %xmm5, %xmm6
+; X32-SSE-NEXT: pandn %xmm1, %xmm6
+; X32-SSE-NEXT: psrlw $4, %xmm1
+; X32-SSE-NEXT: pand %xmm5, %xmm1
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT: por %xmm6, %xmm1
+; X32-SSE-NEXT: paddb %xmm2, %xmm2
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm5
+; X32-SSE-NEXT: movdqa %xmm5, %xmm6
+; X32-SSE-NEXT: pandn %xmm1, %xmm6
+; X32-SSE-NEXT: psrlw $2, %xmm1
+; X32-SSE-NEXT: pand %xmm5, %xmm1
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT: por %xmm6, %xmm1
+; X32-SSE-NEXT: paddb %xmm2, %xmm2
+; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm2
+; X32-SSE-NEXT: pandn %xmm1, %xmm2
+; X32-SSE-NEXT: psrlw $1, %xmm1
+; X32-SSE-NEXT: pand %xmm3, %xmm1
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT: por %xmm2, %xmm1
+; X32-SSE-NEXT: por %xmm4, %xmm1
+; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
%res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
ret <16 x i8> %res
@@ -1196,141 +1069,121 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
; SSE2-LABEL: splatvar_funnnel_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: psrlq %xmm2, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [64,64]
-; SSE2-NEXT: psubq %xmm2, %xmm4
-; SSE2-NEXT: psllq %xmm4, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [63,63]
+; SSE2-NEXT: pandn %xmm4, %xmm3
+; SSE2-NEXT: psllq $1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psllq %xmm3, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; SSE2-NEXT: psllq %xmm3, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1]
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: psrlq %xmm2, %xmm1
+; SSE2-NEXT: orpd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_funnnel_v2i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlq %xmm2, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [64,64]
-; SSE41-NEXT: psubq %xmm2, %xmm4
-; SSE41-NEXT: psllq %xmm4, %xmm3
-; SSE41-NEXT: por %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pcmpeqq %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [63,63]
+; SSE41-NEXT: pandn %xmm4, %xmm3
+; SSE41-NEXT: psllq $1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: psllq %xmm3, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; SSE41-NEXT: psllq %xmm3, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: pand %xmm4, %xmm2
+; SSE41-NEXT: psrlq %xmm2, %xmm1
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_funnnel_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [63,63]
+; AVX1-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_funnnel_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX2-NEXT: vpsllq %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq %xmm2, %xmm3
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [63,63]
+; AVX2-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlq %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX2-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v2i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512F-NEXT: vpsllq %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [63,63]
+; AVX512F-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512F-NEXT: vpsrlq %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX512F-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX512F-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v2i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512VL-NEXT: vpsllq %xmm4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512VL-NEXT: vptestnmq %xmm2, %xmm2, %k1
-; AVX512VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastq %xmm2, %xmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [63,63]
+; AVX512VL-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT: vpsrlq %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX512VL-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v2i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT: vpsllq %xmm4, %xmm0, %xmm0
-; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: vpbroadcastq %xmm2, %xmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [63,63]
+; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlq %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX512BW-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v2i64:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512VBMI2-NEXT: vpsllq %xmm4, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %xmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [63,63]
+; AVX512VBMI2-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsrlq %xmm2, %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX512VBMI2-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v2i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpsllq %xmm4, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512VLBW-NEXT: vptestnmq %xmm2, %xmm2, %k1
-; AVX512VLBW-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
+; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %xmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [63,63]
+; AVX512VLBW-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsrlq %xmm2, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX512VLBW-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v2i64:
@@ -1342,54 +1195,47 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
;
; XOPAVX1-LABEL: splatvar_funnnel_v2i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [63,63]
+; XOPAVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpsllq $1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshlq %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_funnnel_v2i64:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; XOPAVX2-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; XOPAVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; XOPAVX2-NEXT: vpsllq %xmm4, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpbroadcastq %xmm2, %xmm2
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpbroadcastq %xmm2, %xmm3
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [63,63]
+; XOPAVX2-NEXT: vpand %xmm4, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpsrlq %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; XOPAVX2-NEXT: vpsllq $1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: splatvar_funnnel_v2i64:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,0,1]
-; X32-SSE-NEXT: pxor %xmm5, %xmm5
-; X32-SSE-NEXT: pcmpeqd %xmm4, %xmm5
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,0,3,2]
-; X32-SSE-NEXT: pand %xmm5, %xmm3
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm5 = [64,0,64,0]
-; X32-SSE-NEXT: psubq %xmm4, %xmm5
-; X32-SSE-NEXT: movdqa %xmm0, %xmm4
-; X32-SSE-NEXT: psllq %xmm5, %xmm4
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
-; X32-SSE-NEXT: psllq %xmm5, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; X32-SSE-NEXT: movdqa %xmm1, %xmm4
-; X32-SSE-NEXT: psrlq %xmm2, %xmm4
-; X32-SSE-NEXT: por %xmm0, %xmm4
-; X32-SSE-NEXT: pand %xmm3, %xmm1
-; X32-SSE-NEXT: pandn %xmm4, %xmm3
-; X32-SSE-NEXT: por %xmm1, %xmm3
-; X32-SSE-NEXT: movdqa %xmm3, %xmm0
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0]
+; X32-SSE-NEXT: movdqa %xmm2, %xmm4
+; X32-SSE-NEXT: pand %xmm3, %xmm4
+; X32-SSE-NEXT: movdqa %xmm1, %xmm5
+; X32-SSE-NEXT: psrlq %xmm4, %xmm5
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; X32-SSE-NEXT: psrlq %xmm4, %xmm1
+; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
+; X32-SSE-NEXT: pandn %xmm3, %xmm2
+; X32-SSE-NEXT: psllq $1, %xmm0
+; X32-SSE-NEXT: movdqa %xmm0, %xmm3
+; X32-SSE-NEXT: psllq %xmm2, %xmm3
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; X32-SSE-NEXT: psllq %xmm2, %xmm0
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; X32-SSE-NEXT: orpd %xmm1, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
%res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat)
@@ -1399,163 +1245,134 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
; SSE2-LABEL: splatvar_funnnel_v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: andps {{.*}}(%rip), %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: xorps %xmm4, %xmm4
-; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: psrld %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
+; SSE2-NEXT: pslld $1, %xmm0
+; SSE2-NEXT: pandn {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pslld $23, %xmm3
+; SSE2-NEXT: paddd {{.*}}(%rip), %xmm3
+; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: movl $32, %ecx
-; SSE2-NEXT: subl %eax, %ecx
-; SSE2-NEXT: movd %ecx, %xmm4
-; SSE2-NEXT: pslld %xmm4, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: andl $31, %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: psrld %xmm2, %xmm1
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_funnnel_v4i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: psrld %xmm0, %xmm4
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [32,32,32,32]
-; SSE41-NEXT: psubd %xmm2, %xmm0
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT: pslld %xmm0, %xmm3
-; SSE41-NEXT: por %xmm4, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: movaps %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31]
+; SSE41-NEXT: pand %xmm4, %xmm2
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; SSE41-NEXT: psrld %xmm2, %xmm1
+; SSE41-NEXT: pandn %xmm4, %xmm3
+; SSE41-NEXT: pslld $23, %xmm3
+; SSE41-NEXT: paddd {{.*}}(%rip), %xmm3
+; SSE41-NEXT: cvttps2dq %xmm3, %xmm2
+; SSE41-NEXT: pslld $1, %xmm0
+; SSE41-NEXT: pmulld %xmm2, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_funnnel_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32]
-; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX1-NEXT: vpslld %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [31,31,31,31]
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX1-NEXT: vpsrld %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
+; AVX1-NEXT: vpslld $1, %xmm0, %xmm0
+; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_funnnel_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX2-NEXT: vpsrld %xmm3, %xmm1, %xmm3
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX2-NEXT: vpslld %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd %xmm2, %xmm3
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31]
+; AVX2-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX2-NEXT: vpsrld %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX2-NEXT: vpslld $1, %xmm0, %xmm0
+; AVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v4i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX512F-NEXT: vpsrld %xmm3, %xmm1, %xmm3
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512F-NEXT: vpslld %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: vpbroadcastd %xmm2, %xmm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31]
+; AVX512F-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512F-NEXT: vpsrld %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX512F-NEXT: vpslld $1, %xmm0, %xmm0
+; AVX512F-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v4i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX512VL-NEXT: vpsrld %xmm3, %xmm1, %xmm3
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512VL-NEXT: vpslld %xmm4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512VL-NEXT: vptestnmd %xmm2, %xmm2, %k1
-; AVX512VL-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31]
+; AVX512VL-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512VL-NEXT: vpsrld %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX512VL-NEXT: vpslld $1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v4i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX512BW-NEXT: vpsrld %xmm3, %xmm1, %xmm3
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512BW-NEXT: vpslld %xmm4, %xmm0, %xmm0
-; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: vpbroadcastd %xmm2, %xmm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31]
+; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512BW-NEXT: vpsrld %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX512BW-NEXT: vpslld $1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v4i32:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX512VBMI2-NEXT: vpsrld %xmm3, %xmm1, %xmm3
-; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512VBMI2-NEXT: vpslld %xmm4, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
-; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm3
+; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31]
+; AVX512VBMI2-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512VBMI2-NEXT: vpsrld %xmm2, %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX512VBMI2-NEXT: vpslld $1, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v4i32:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX512VLBW-NEXT: vpsrld %xmm3, %xmm1, %xmm3
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512VLBW-NEXT: vpslld %xmm4, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512VLBW-NEXT: vptestnmd %xmm2, %xmm2, %k1
-; AVX512VLBW-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
+; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31]
+; AVX512VLBW-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512VLBW-NEXT: vpsrld %xmm2, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX512VLBW-NEXT: vpslld $1, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i32:
@@ -1567,57 +1384,50 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
;
; XOPAVX1-LABEL: splatvar_funnnel_v4i32:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; XOPAVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm3
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32]
-; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; XOPAVX1-NEXT: vpslld %xmm4, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [31,31,31,31]
+; XOPAVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; XOPAVX1-NEXT: vpsrld %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpslld $1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshld %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_funnnel_v4i32:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; XOPAVX2-NEXT: vpsrld %xmm3, %xmm1, %xmm3
-; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; XOPAVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; XOPAVX2-NEXT: vpslld %xmm4, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpbroadcastd %xmm2, %xmm2
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpbroadcastd %xmm2, %xmm3
+; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31]
+; XOPAVX2-NEXT: vpand %xmm4, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; XOPAVX2-NEXT: vpsrld %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; XOPAVX2-NEXT: vpslld $1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: splatvar_funnnel_v4i32:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT: pxor %xmm3, %xmm3
-; X32-SSE-NEXT: xorps %xmm4, %xmm4
-; X32-SSE-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
-; X32-SSE-NEXT: movdqa %xmm1, %xmm5
-; X32-SSE-NEXT: psrld %xmm4, %xmm5
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
+; X32-SSE-NEXT: pslld $1, %xmm0
+; X32-SSE-NEXT: pandn {{\.LCPI.*}}, %xmm3
+; X32-SSE-NEXT: pslld $23, %xmm3
+; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm3
+; X32-SSE-NEXT: cvttps2dq %xmm3, %xmm3
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; X32-SSE-NEXT: pmuludq %xmm3, %xmm0
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; X32-SSE-NEXT: pmuludq %xmm4, %xmm3
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; X32-SSE-NEXT: movd %xmm2, %eax
-; X32-SSE-NEXT: movl $32, %ecx
-; X32-SSE-NEXT: subl %eax, %ecx
-; X32-SSE-NEXT: movd %ecx, %xmm4
-; X32-SSE-NEXT: pslld %xmm4, %xmm0
-; X32-SSE-NEXT: por %xmm5, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; X32-SSE-NEXT: pcmpeqd %xmm3, %xmm2
-; X32-SSE-NEXT: pand %xmm2, %xmm1
-; X32-SSE-NEXT: pandn %xmm0, %xmm2
-; X32-SSE-NEXT: por %xmm1, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm0
+; X32-SSE-NEXT: andl $31, %eax
+; X32-SSE-NEXT: movd %eax, %xmm2
+; X32-SSE-NEXT: psrld %xmm2, %xmm1
+; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <4 x i32> %amt, <4 x i32> undef, <4 x i32> zeroinitializer
%res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %splat)
@@ -1627,161 +1437,175 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
; SSE2-LABEL: splatvar_funnnel_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
-; SSE2-NEXT: psubw %xmm2, %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
+; SSE2-NEXT: pandn %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $23, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT: paddd %xmm6, %xmm5
+; SSE2-NEXT: cvttps2dq %xmm5, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $23, %xmm3
+; SSE2-NEXT: paddd %xmm6, %xmm3
+; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; SSE2-NEXT: psllw $1, %xmm0
+; SSE2-NEXT: pmullw %xmm3, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: psrlw %xmm2, %xmm5
-; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
-; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: psllw %xmm3, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0]
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpeqw %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: psrlw %xmm2, %xmm1
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_funnnel_v8i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: psrlw %xmm0, %xmm4
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [16,16,16,16,16,16,16,16]
-; SSE41-NEXT: psubw %xmm2, %xmm0
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE41-NEXT: psllw %xmm0, %xmm3
-; SSE41-NEXT: por %xmm4, %xmm3
-; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pcmpeqw %xmm2, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,0,0,0,4,5,6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
+; SSE41-NEXT: pandn %xmm4, %xmm3
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
+; SSE41-NEXT: pslld $23, %xmm3
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
+; SSE41-NEXT: paddd %xmm6, %xmm3
+; SSE41-NEXT: cvttps2dq %xmm3, %xmm3
+; SSE41-NEXT: pslld $23, %xmm5
+; SSE41-NEXT: paddd %xmm6, %xmm5
+; SSE41-NEXT: cvttps2dq %xmm5, %xmm5
+; SSE41-NEXT: packusdw %xmm3, %xmm5
+; SSE41-NEXT: psllw $1, %xmm0
+; SSE41-NEXT: pmullw %xmm5, %xmm0
+; SSE41-NEXT: pand %xmm4, %xmm2
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; SSE41-NEXT: psrlw %xmm2, %xmm1
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_funnnel_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX1-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm3[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
+; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX1-NEXT: vpsrlw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_funnnel_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX2-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX2-NEXT: vpsllw %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastw %xmm2, %xmm3
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX2-NEXT: vpsrlw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX2-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v8i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512F-NEXT: vpsrlw %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX512F-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VL-NEXT: vpsrlw %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX512VL-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v8i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512BW-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512BW-NEXT: vpsllw %xmm4, %xmm0, %xmm0
-; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm3
+; AVX512BW-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512BW-NEXT: vpsrlw %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v8i16:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VBMI2-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VBMI2-NEXT: vpsllw %xmm4, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
-; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %xmm3
+; AVX512VBMI2-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
+; AVX512VBMI2-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX512VBMI2-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm2, %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VLBW-NEXT: vpsllw %xmm4, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512VLBW-NEXT: vptestnmw %xmm2, %xmm2, %k1
-; AVX512VLBW-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1}
+; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VLBW-NEXT: vpsrlw %xmm2, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX512VLBW-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpsllvw %xmm2, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i16:
@@ -1793,58 +1617,61 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; XOPAVX1-LABEL: splatvar_funnnel_v8i16:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; XOPAVX1-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; XOPAVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpcomeqw %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,0,0,4,5,6,7]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
+; XOPAVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; XOPAVX1-NEXT: vpsrlw %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpsllw $1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshlw %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_funnnel_v8i16:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; XOPAVX2-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; XOPAVX2-NEXT: vpsllw %xmm4, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpbroadcastw %xmm2, %xmm2
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpcomeqw %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpbroadcastw %xmm2, %xmm3
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
+; XOPAVX2-NEXT: vpand %xmm4, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; XOPAVX2-NEXT: vpsrlw %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; XOPAVX2-NEXT: vpsllw $1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpshlw %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: splatvar_funnnel_v8i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
-; X32-SSE-NEXT: psubw %xmm2, %xmm3
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,0,0,0,4,5,6,7]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,0,0,0,4,5,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: pandn %xmm4, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm5
+; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pslld $23, %xmm5
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
+; X32-SSE-NEXT: paddd %xmm6, %xmm5
+; X32-SSE-NEXT: cvttps2dq %xmm5, %xmm5
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
+; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
+; X32-SSE-NEXT: pslld $23, %xmm3
+; X32-SSE-NEXT: paddd %xmm6, %xmm3
+; X32-SSE-NEXT: cvttps2dq %xmm3, %xmm3
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; X32-SSE-NEXT: psllw $1, %xmm0
+; X32-SSE-NEXT: pmullw %xmm3, %xmm0
+; X32-SSE-NEXT: pand %xmm4, %xmm2
; X32-SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT: movdqa %xmm1, %xmm5
-; X32-SSE-NEXT: psrlw %xmm2, %xmm5
-; X32-SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
-; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT: psllw %xmm3, %xmm0
-; X32-SSE-NEXT: por %xmm5, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0]
-; X32-SSE-NEXT: pxor %xmm2, %xmm2
-; X32-SSE-NEXT: pcmpeqw %xmm3, %xmm2
-; X32-SSE-NEXT: pand %xmm2, %xmm1
-; X32-SSE-NEXT: pandn %xmm0, %xmm2
-; X32-SSE-NEXT: por %xmm1, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm0
+; X32-SSE-NEXT: psrlw %xmm2, %xmm1
+; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> zeroinitializer
%res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %splat)
@@ -1854,300 +1681,331 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
; SSE2-LABEL: splatvar_funnnel_v16i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
-; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: psrlw %xmm3, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE2-NEXT: psrlw %xmm3, %xmm6
-; SSE2-NEXT: psrlw $8, %xmm6
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; SSE2-NEXT: psubb %xmm2, %xmm4
-; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
-; SSE2-NEXT: psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: psllw %xmm4, %xmm0
-; SSE2-NEXT: psllw %xmm4, %xmm5
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm4
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqb %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm4, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,0,0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; SSE2-NEXT: pandn %xmm3, %xmm4
+; SSE2-NEXT: psllw $5, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: pcmpgtb %xmm4, %xmm6
+; SSE2-NEXT: paddb %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm6, %xmm7
+; SSE2-NEXT: pandn %xmm0, %xmm7
+; SSE2-NEXT: psllw $4, %xmm0
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm7, %xmm0
+; SSE2-NEXT: paddb %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: pcmpgtb %xmm4, %xmm6
+; SSE2-NEXT: movdqa %xmm6, %xmm7
+; SSE2-NEXT: pandn %xmm0, %xmm7
+; SSE2-NEXT: psllw $2, %xmm0
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm7, %xmm0
+; SSE2-NEXT: paddb %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtb %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: paddb %xmm0, %xmm0
+; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psrlw %xmm2, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
+; SSE2-NEXT: psrlw %xmm2, %xmm3
+; SSE2-NEXT: psrlw $8, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_funnnel_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT: pxor %xmm4, %xmm4
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pshufb %xmm4, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; SSE41-NEXT: pandn %xmm5, %xmm0
+; SSE41-NEXT: psllw $5, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: paddb %xmm0, %xmm4
+; SSE41-NEXT: paddb %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm6
+; SSE41-NEXT: psllw $4, %xmm6
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm6
+; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm6
+; SSE41-NEXT: psllw $2, %xmm6
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm6
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm6
+; SSE41-NEXT: paddb %xmm3, %xmm6
+; SSE41-NEXT: paddb %xmm4, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm3
+; SSE41-NEXT: pand %xmm5, %xmm2
; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: psrlw %xmm0, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE41-NEXT: psrlw %xmm0, %xmm6
-; SSE41-NEXT: pshufb {{.*#+}} xmm6 = xmm6[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SSE41-NEXT: pand %xmm4, %xmm6
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; SSE41-NEXT: psubb %xmm2, %xmm0
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: psllw %xmm0, %xmm3
-; SSE41-NEXT: psllw %xmm0, %xmm5
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pshufb %xmm0, %xmm5
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: por %xmm6, %xmm3
-; SSE41-NEXT: pshufb %xmm0, %xmm2
-; SSE41-NEXT: pcmpeqb %xmm2, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: psrlw %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: psrlw %xmm0, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE41-NEXT: pand %xmm1, %xmm2
+; SSE41-NEXT: por %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_funnnel_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrlw %xmm3, %xmm1, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX1-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpsllw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX1-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsllw $5, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm5
+; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm6
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm6, %xmm6
+; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $2, %xmm0, %xmm3
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm3
+; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrlw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_funnnel_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpsrlw %xmm3, %xmm1, %xmm4
-; AVX2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX2-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
-; AVX2-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX2-NEXT: vpbroadcastb %xmm3, %xmm3
-; AVX2-NEXT: vpand %xmm3, %xmm4, %xmm3
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpsllw %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpsllw %xmm4, %xmm5, %xmm4
-; AVX2-NEXT: vpbroadcastb %xmm4, %xmm4
-; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb %xmm2, %xmm3
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX2-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpsllw $5, %xmm3, %xmm3
+; AVX2-NEXT: vpaddb %xmm3, %xmm3, %xmm5
+; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $4, %xmm0, %xmm6
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm6, %xmm6
+; AVX2-NEXT: vpblendvb %xmm3, %xmm6, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $2, %xmm0, %xmm3
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
+; AVX2-NEXT: vpblendvb %xmm5, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm3
+; AVX2-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; AVX2-NEXT: vpblendvb %xmm5, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpsrlw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vpsrlw %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v16i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsrld %xmm4, %zmm3, %zmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512F-NEXT: vpsrld %xmm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512F-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512F-NEXT: vpslld %xmm4, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsrld %xmm4, %zmm3, %zmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT: vpsrld %xmm2, %zmm1, %zmm1
+; AVX512VL-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512VL-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT: vpslld %xmm4, %zmm0, %zmm0
-; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpsrlw %xmm4, %ymm3, %ymm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm3
+; AVX512BW-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpsllw %xmm4, %ymm0, %ymm0
-; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
+; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VBMI2-NEXT: vpsrlw %xmm4, %ymm3, %ymm3
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm3
+; AVX512VBMI2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VBMI2-NEXT: vpsllw %xmm4, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX512VBMI2-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
-; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm3, %ymm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLBW-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLBW-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLBW-NEXT: vpsllw %xmm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpsllvw %ymm2, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %ymm3, %ymm3
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
+; AVX512VLVBMI2-NEXT: vpandn %xmm4, %xmm3, %xmm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLVBMI2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %ymm0, %ymm0
-; AVX512VLVBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpsllvw %ymm2, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1
-; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VLVBMI2-NEXT: vzeroupper
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_funnnel_v16i8:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm4
-; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm4
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; XOPAVX1-NEXT: vpsubb %xmm2, %xmm5, %xmm5
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX1-NEXT: vpandn %xmm4, %xmm2, %xmm5
+; XOPAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vpshlb %xmm5, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpor %xmm4, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_funnnel_v16i8:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX2-NEXT: vpbroadcastb %xmm2, %xmm2
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; XOPAVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpshlb %xmm4, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm4
-; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm4
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; XOPAVX2-NEXT: vpsubb %xmm2, %xmm5, %xmm5
-; XOPAVX2-NEXT: vpshlb %xmm5, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpor %xmm4, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm2
+; XOPAVX2-NEXT: vpshlb %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: splatvar_funnnel_v16i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
-; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT: movdqa %xmm1, %xmm4
-; X32-SSE-NEXT: psrlw %xmm3, %xmm4
-; X32-SSE-NEXT: pcmpeqd %xmm5, %xmm5
-; X32-SSE-NEXT: pcmpeqd %xmm6, %xmm6
-; X32-SSE-NEXT: psrlw %xmm3, %xmm6
-; X32-SSE-NEXT: psrlw $8, %xmm6
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,0,0,0,4,5,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
-; X32-SSE-NEXT: pand %xmm4, %xmm3
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; X32-SSE-NEXT: psubb %xmm2, %xmm4
-; X32-SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
-; X32-SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT: psllw %xmm4, %xmm0
-; X32-SSE-NEXT: psllw %xmm4, %xmm5
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,0,0,0,4,5,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
-; X32-SSE-NEXT: pand %xmm0, %xmm4
-; X32-SSE-NEXT: por %xmm3, %xmm4
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
-; X32-SSE-NEXT: pxor %xmm0, %xmm0
-; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm0
-; X32-SSE-NEXT: pand %xmm0, %xmm1
-; X32-SSE-NEXT: pandn %xmm4, %xmm0
-; X32-SSE-NEXT: por %xmm1, %xmm0
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,0,0]
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; X32-SSE-NEXT: pandn %xmm3, %xmm4
+; X32-SSE-NEXT: psllw $5, %xmm4
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: pxor %xmm6, %xmm6
+; X32-SSE-NEXT: pcmpgtb %xmm4, %xmm6
+; X32-SSE-NEXT: paddb %xmm0, %xmm0
+; X32-SSE-NEXT: movdqa %xmm6, %xmm7
+; X32-SSE-NEXT: pandn %xmm0, %xmm7
+; X32-SSE-NEXT: psllw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm6, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm7, %xmm0
+; X32-SSE-NEXT: paddb %xmm4, %xmm4
+; X32-SSE-NEXT: pxor %xmm6, %xmm6
+; X32-SSE-NEXT: pcmpgtb %xmm4, %xmm6
+; X32-SSE-NEXT: movdqa %xmm6, %xmm7
+; X32-SSE-NEXT: pandn %xmm0, %xmm7
+; X32-SSE-NEXT: psllw $2, %xmm0
+; X32-SSE-NEXT: pand %xmm6, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm7, %xmm0
+; X32-SSE-NEXT: paddb %xmm4, %xmm4
+; X32-SSE-NEXT: pcmpgtb %xmm4, %xmm5
+; X32-SSE-NEXT: movdqa %xmm5, %xmm4
+; X32-SSE-NEXT: pandn %xmm0, %xmm4
+; X32-SSE-NEXT: paddb %xmm0, %xmm0
+; X32-SSE-NEXT: pand %xmm5, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm2
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psrlw %xmm2, %xmm1
+; X32-SSE-NEXT: pcmpeqd %xmm3, %xmm3
+; X32-SSE-NEXT: psrlw %xmm2, %xmm3
+; X32-SSE-NEXT: psrlw $8, %xmm3
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,0,0,0,4,5,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; X32-SSE-NEXT: pand %xmm1, %xmm2
+; X32-SSE-NEXT: por %xmm4, %xmm2
+; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
%res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %splat)
@@ -2260,13 +2118,21 @@ define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
;
; X32-SSE-LABEL: constant_funnnel_v2i64:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: psrlq $4, %xmm2
-; X32-SSE-NEXT: psrlq $14, %xmm1
-; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [63,0,63,0]
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = <4,u,14,u>
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pand %xmm2, %xmm4
+; X32-SSE-NEXT: movdqa %xmm1, %xmm5
+; X32-SSE-NEXT: psrlq %xmm4, %xmm5
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; X32-SSE-NEXT: psrlq %xmm4, %xmm1
+; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
+; X32-SSE-NEXT: pandn %xmm2, %xmm3
+; X32-SSE-NEXT: psllq $1, %xmm0
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psllq $60, %xmm2
-; X32-SSE-NEXT: psllq $50, %xmm0
+; X32-SSE-NEXT: psllq %xmm3, %xmm2
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; X32-SSE-NEXT: psllq %xmm3, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X32-SSE-NEXT: orpd %xmm1, %xmm0
; X32-SSE-NEXT: retl
@@ -2418,86 +2284,84 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; SSE2-LABEL: constant_funnnel_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pmulhuw %xmm2, %xmm3
-; SSE2-NEXT: pmullw %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm3
+; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm1
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: psllw $1, %xmm0
+; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_funnnel_v8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pmulhuw %xmm2, %xmm3
-; SSE41-NEXT: pmullw %xmm2, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; SSE41-NEXT: pmulhuw %xmm1, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; SSE41-NEXT: psllw $1, %xmm0
+; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: constant_funnnel_v8i16:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
-; AVX-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
-; AVX-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm2
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; AVX-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: constant_funnnel_v8i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
-; AVX512F-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
-; AVX512F-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; AVX512F-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; AVX512F-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX512F-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
-; AVX512VL-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
-; AVX512VL-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm2
+; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,1,2,3,4,5,6,7>
-; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,15,14,13,12,11,10,9>
-; AVX512BW-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT: vpor %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8]
+; AVX512BW-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: constant_funnnel_v8i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = <u,1,2,3,4,5,6,7>
-; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm2
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,15,14,13,12,11,10,9>
-; AVX512VBMI2-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vpor %xmm2, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
+; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8]
+; AVX512VBMI2-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: constant_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %xmm1, %xmm2
+; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpsllw $1, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpor %xmm2, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v8i16:
@@ -2508,22 +2372,22 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
;
; XOP-LABEL: constant_funnnel_v8i16:
; XOP: # %bb.0:
-; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm1, %xmm2
+; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm1, %xmm1
+; XOP-NEXT: vpsllw $1, %xmm0, %xmm0
; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpor %xmm2, %xmm0, %xmm0
-; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: constant_funnnel_v8i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
-; X32-SSE-NEXT: movdqa %xmm1, %xmm3
-; X32-SSE-NEXT: pmulhuw %xmm2, %xmm3
-; X32-SSE-NEXT: pmullw %xmm2, %xmm0
-; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pandn %xmm1, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm3
+; X32-SSE-NEXT: pandn %xmm1, %xmm3
+; X32-SSE-NEXT: pmulhuw {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT: pand %xmm1, %xmm2
+; X32-SSE-NEXT: psllw $1, %xmm0
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
%res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
@@ -2536,191 +2400,170 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = <u,2,4,8,16,32,64,128>
-; SSE2-NEXT: pmullw %xmm4, %xmm3
+; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm3
; SSE2-NEXT: psrlw $8, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = <u,128,64,32,16,8,4,2>
-; SSE2-NEXT: pmullw %xmm2, %xmm5
-; SSE2-NEXT: psrlw $8, %xmm5
-; SSE2-NEXT: packuswb %xmm3, %xmm5
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pmullw %xmm4, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: packuswb %xmm3, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: packuswb %xmm3, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_funnnel_v16i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <u,2,4,8,16,32,64,128>
-; SSE41-NEXT: pmullw %xmm2, %xmm3
-; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = <u,128,64,32,16,8,4,2>
-; SSE41-NEXT: pmullw %xmm5, %xmm4
-; SSE41-NEXT: psrlw $8, %xmm4
-; SSE41-NEXT: packuswb %xmm3, %xmm4
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: paddb %xmm0, %xmm0
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT: pmullw %xmm2, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: pmullw %xmm5, %xmm3
-; SSE41-NEXT: pand %xmm2, %xmm3
-; SSE41-NEXT: packuswb %xmm0, %xmm3
-; SSE41-NEXT: por %xmm4, %xmm3
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm2
+; SSE41-NEXT: pand %xmm3, %xmm2
+; SSE41-NEXT: packuswb %xmm0, %xmm2
+; SSE41-NEXT: pxor %xmm3, %xmm3
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
+; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_funnnel_v16i8:
; AVX1: # %bb.0:
+; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <u,2,4,8,16,32,64,128>
-; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <u,128,64,32,16,8,4,2>
-; AVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpackuswb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_funnnel_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: constant_funnnel_v16i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm2, %zmm2
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
+; AVX512F-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512F-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VL-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512VL-NEXT: vpord %zmm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vpternlogq $228, {{.*}}(%rip){1to2}, %xmm1, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,1,2,3,4,5,6,7,u,7,6,5,4,3,2,1>
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <u,7,6,5,4,3,2,1,u,1,2,3,4,5,6,7>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,7,0,1,2,3,4,5,6]
+; AVX512BW-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: constant_funnnel_v16i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,1,2,3,4,5,6,7,u,7,6,5,4,3,2,1>
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm3, %zmm2
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = <u,7,6,5,4,3,2,1,u,1,2,3,4,5,6,7>
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,7,0,1,2,3,4,5,6]
+; AVX512VBMI2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VBMI2-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512VBMI2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: constant_funnnel_v16i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512VLBW-NEXT: movw $257, %ax # imm = 0x101
-; AVX512VLBW-NEXT: kmovd %eax, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VLVBMI2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512VLVBMI2-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512VLVBMI2-NEXT: movw $257, %ax # imm = 0x101
-; AVX512VLVBMI2-NEXT: kmovd %eax, %k1
-; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VLVBMI2-NEXT: vzeroupper
; AVX512VLVBMI2-NEXT: retq
;
; XOP-LABEL: constant_funnnel_v16i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm1, %xmm2
+; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm1, %xmm1
+; XOP-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpor %xmm2, %xmm0, %xmm0
-; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; XOP-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: constant_funnnel_v16i8:
@@ -2728,29 +2571,23 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; X32-SSE-NEXT: pxor %xmm2, %xmm2
; X32-SSE-NEXT: movdqa %xmm1, %xmm3
; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = <u,2,4,8,16,32,64,128>
-; X32-SSE-NEXT: pmullw %xmm4, %xmm3
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm3
; X32-SSE-NEXT: psrlw $8, %xmm3
-; X32-SSE-NEXT: movdqa %xmm1, %xmm5
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = <u,128,64,32,16,8,4,2>
-; X32-SSE-NEXT: pmullw %xmm2, %xmm5
-; X32-SSE-NEXT: psrlw $8, %xmm5
-; X32-SSE-NEXT: packuswb %xmm3, %xmm5
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X32-SSE-NEXT: pmullw %xmm4, %xmm3
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; X32-SSE-NEXT: pand %xmm4, %xmm3
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT: psrlw $8, %xmm1
+; X32-SSE-NEXT: packuswb %xmm3, %xmm1
+; X32-SSE-NEXT: paddb %xmm0, %xmm0
+; X32-SSE-NEXT: movdqa %xmm0, %xmm2
+; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; X32-SSE-NEXT: pand %xmm3, %xmm2
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pmullw %xmm2, %xmm0
-; X32-SSE-NEXT: pand %xmm4, %xmm0
-; X32-SSE-NEXT: packuswb %xmm3, %xmm0
-; X32-SSE-NEXT: por %xmm5, %xmm0
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pandn %xmm1, %xmm2
-; X32-SSE-NEXT: por %xmm2, %xmm0
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: packuswb %xmm2, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
%res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
ret <16 x i8> %res
@@ -2825,8 +2662,10 @@ define <2 x i64> @splatconstant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwi
; X32-SSE-LABEL: splatconstant_funnnel_v2i64:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psrlq $14, %xmm1
+; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm1[0,1]
; X32-SSE-NEXT: psllq $50, %xmm0
-; X32-SSE-NEXT: por %xmm1, %xmm0
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm0[0,1]
+; X32-SSE-NEXT: orpd %xmm1, %xmm0
; X32-SSE-NEXT: retl
%res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 14, i64 14>)
ret <2 x i64> %res
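[Note, not part of the patch: the updated X86 checks above all follow the same generic fshr expansion, which sidesteps the amount==0 select by pre-shifting the first operand left by one and splitting the masked amount between the two halves. A rough scalar sketch in C, with the helper name chosen here purely for illustration:

  #include <stdint.h>

  /* Hypothetical helper, not from the patch: scalar form of the fshr
     expansion the new vector sequences mirror. Both shift amounts stay
     in [0, 63], so no separate zero-amount select is needed.          */
  static uint64_t fshr64_expanded(uint64_t x, uint64_t y, uint64_t amt) {
    uint64_t lo = y >> (amt & 63);          /* pand 63  + psrlvq  */
    uint64_t hi = (x << 1) << (~amt & 63);  /* psllq $1 + pandn 63 + psllvq */
    return hi | lo;                         /* por                */
  }

When amt & 63 == 0 the left shift totals 64 across the two steps and contributes 0, so the result is just y, matching the fshr semantics without the old pcmpeq/blendv select.]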
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index 3f808acaeb26..e6f3097e7496 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -22,120 +22,100 @@ declare <32 x i8> @llvm.fshr.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind {
; AVX1-LABEL: var_funnnel_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vpsrlq %xmm4, %xmm3, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3]
-; AVX1-NEXT: vpsrlq %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63]
+; AVX1-NEXT: vandps %ymm3, %ymm2, %ymm4
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm7
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4,5,6,7]
+; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
+; AVX1-NEXT: vandnps %ymm3, %ymm2, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpsllq $1, %xmm4, %xmm4
+; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
-; AVX1-NEXT: vpsrlq %xmm6, %xmm1, %xmm6
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [64,64]
-; AVX1-NEXT: vpsubq %xmm4, %xmm8, %xmm6
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT: vpsllq %xmm6, %xmm7, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
-; AVX1-NEXT: vpsllq %xmm6, %xmm7, %xmm6
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
-; AVX1-NEXT: vpsubq %xmm2, %xmm8, %xmm6
-; AVX1-NEXT: vpsllq %xmm6, %xmm0, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
-; AVX1-NEXT: vpsllq %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
-; AVX1-NEXT: vorps %ymm3, %ymm0, %ymm0
-; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqq %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; AVX1-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpsrlvq %ymm2, %ymm1, %ymm3
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
-; AVX2-NEXT: vpsubq %ymm2, %ymm4, %ymm4
-; AVX2-NEXT: vpsllvq %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
+; AVX2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpsllq $1, %ymm0, %ymm0
+; AVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v4i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlvq %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
-; AVX512F-NEXT: vpsubq %ymm2, %ymm4, %ymm4
-; AVX512F-NEXT: vpsllvq %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
+; AVX512F-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllq $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v4i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
-; AVX512VL-NEXT: vpsrlvq %ymm2, %ymm1, %ymm3
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
-; AVX512VL-NEXT: vpsubq %ymm2, %ymm4, %ymm4
-; AVX512VL-NEXT: vpsllvq %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vptestnmq %ymm2, %ymm2, %k1
-; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4
+; AVX512VL-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsllq $1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v4i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512BW-NEXT: vpsrlvq %ymm2, %ymm1, %ymm3
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
-; AVX512BW-NEXT: vpsubq %ymm2, %ymm4, %ymm4
-; AVX512BW-NEXT: vpsllvq %ymm4, %ymm0, %ymm0
-; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
+; AVX512BW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
+; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpsllq $1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v4i64:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512VBMI2-NEXT: vpsrlvq %ymm2, %ymm1, %ymm3
-; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
-; AVX512VBMI2-NEXT: vpsubq %ymm2, %ymm4, %ymm4
-; AVX512VBMI2-NEXT: vpsllvq %ymm4, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
+; AVX512VBMI2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpsllq $1, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v4i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
-; AVX512VLBW-NEXT: vpsrlvq %ymm2, %ymm1, %ymm3
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
-; AVX512VLBW-NEXT: vpsubq %ymm2, %ymm4, %ymm4
-; AVX512VLBW-NEXT: vpsllvq %ymm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vptestnmq %ymm2, %ymm2, %k1
-; AVX512VLBW-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1}
+; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
+; AVX512VLBW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpsllq $1, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v4i64:
@@ -146,41 +126,36 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
;
; XOPAVX1-LABEL: var_funnnel_v4i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63]
+; XOPAVX1-NEXT: vandnps %ymm3, %ymm2, %ymm4
+; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; XOPAVX1-NEXT: vpsllq $1, %xmm6, %xmm6
+; XOPAVX1-NEXT: vpshlq %xmm5, %xmm6, %xmm5
+; XOPAVX1-NEXT: vpsllq $1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshlq %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpsubq %xmm3, %xmm4, %xmm5
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
-; XOPAVX1-NEXT: vpshlq %xmm5, %xmm6, %xmm5
-; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm6
-; XOPAVX1-NEXT: vpshlq %xmm6, %xmm1, %xmm6
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [64,64]
-; XOPAVX1-NEXT: vpsubq %xmm3, %xmm8, %xmm7
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; XOPAVX1-NEXT: vpshlq %xmm7, %xmm6, %xmm6
-; XOPAVX1-NEXT: vpsubq %xmm2, %xmm8, %xmm7
-; XOPAVX1-NEXT: vpshlq %xmm7, %xmm0, %xmm0
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; XOPAVX1-NEXT: vorpd %ymm5, %ymm0, %ymm0
-; XOPAVX1-NEXT: vpcomeqq %xmm4, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpcomeqq %xmm4, %xmm2, %xmm2
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; XOPAVX1-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT: vpsubq %xmm3, %xmm4, %xmm3
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; XOPAVX1-NEXT: vpshlq %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
+; XOPAVX1-NEXT: vpshlq %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_funnnel_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
-; XOPAVX2-NEXT: vpsrlvq %ymm2, %ymm1, %ymm3
-; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
-; XOPAVX2-NEXT: vpsubq %ymm2, %ymm4, %ymm4
-; XOPAVX2-NEXT: vpsllvq %ymm4, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpcmpeqq %ymm3, %ymm2, %ymm2
-; XOPAVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
+; XOPAVX2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
+; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT: vpsllq $1, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%res = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
ret <4 x i64> %res
@@ -189,135 +164,116 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) nounwind {
; AVX1-LABEL: var_funnnel_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrld %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm6
-; AVX1-NEXT: vpsrld %xmm6, %xmm5, %xmm6
-; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm4[4,5,6,7]
-; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm3[2],xmm8[2],xmm3[3],xmm8[3]
-; AVX1-NEXT: vpsrld %xmm7, %xmm5, %xmm7
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
-; AVX1-NEXT: vpsrld %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [31,31,31,31,31,31,31,31]
+; AVX1-NEXT: vandps %ymm2, %ymm8, %ymm4
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT: vpsrld %xmm6, %xmm7, %xmm6
+; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm3
+; AVX1-NEXT: vpsrld %xmm3, %xmm7, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm9[2],xmm5[3],xmm9[3]
+; AVX1-NEXT: vpsrld %xmm6, %xmm7, %xmm6
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
+; AVX1-NEXT: vpsrld %xmm5, %xmm7, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3],xmm5[4,5],xmm3[6,7]
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5
-; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6
+; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm6
; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm2[2],xmm8[2],xmm2[3],xmm8[3]
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm9[2],xmm4[3],xmm9[3]
; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsrld %xmm7, %xmm1, %xmm7
-; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [32,32,32,32]
-; AVX1-NEXT: vpsubd %xmm3, %xmm9, %xmm6
-; AVX1-NEXT: vpslld $23, %xmm6, %xmm6
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1065353216,1065353216,1065353216,1065353216]
-; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vandnps %ymm8, %ymm2, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpmulld %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vpsubd %xmm2, %xmm9, %xmm6
-; AVX1-NEXT: vpslld $23, %xmm6, %xmm6
-; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6
-; AVX1-NEXT: vpmulld %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
-; AVX1-NEXT: vorps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm8, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm8, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vpslld $1, %xmm5, %xmm5
+; AVX1-NEXT: vpmulld %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
+; AVX1-NEXT: vpslld $1, %xmm0, %xmm0
+; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpsrlvd %ymm2, %ymm1, %ymm3
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
-; AVX2-NEXT: vpsubd %ymm2, %ymm4, %ymm4
-; AVX2-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
+; AVX2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpslld $1, %ymm0, %ymm0
+; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v8i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlvd %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
-; AVX512F-NEXT: vpsubd %ymm2, %ymm4, %ymm4
-; AVX512F-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
+; AVX512F-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpslld $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v8i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
-; AVX512VL-NEXT: vpsrlvd %ymm2, %ymm1, %ymm3
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
-; AVX512VL-NEXT: vpsubd %ymm2, %ymm4, %ymm4
-; AVX512VL-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vptestnmd %ymm2, %ymm2, %k1
-; AVX512VL-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4
+; AVX512VL-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpslld $1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v8i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512BW-NEXT: vpsrlvd %ymm2, %ymm1, %ymm3
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT: vpsubd %ymm2, %ymm4, %ymm4
-; AVX512BW-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
-; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
+; AVX512BW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
+; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpslld $1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v8i32:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512VBMI2-NEXT: vpsrlvd %ymm2, %ymm1, %ymm3
-; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
-; AVX512VBMI2-NEXT: vpsubd %ymm2, %ymm4, %ymm4
-; AVX512VBMI2-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
-; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
+; AVX512VBMI2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpslld $1, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v8i32:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
-; AVX512VLBW-NEXT: vpsrlvd %ymm2, %ymm1, %ymm3
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %ymm2, %ymm4, %ymm4
-; AVX512VLBW-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vptestnmd %ymm2, %ymm2, %k1
-; AVX512VLBW-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1}
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
+; AVX512VLBW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpslld $1, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v8i32:
@@ -328,41 +284,36 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
;
; XOPAVX1-LABEL: var_funnnel_v8i32:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; XOPAVX1-NEXT: vandnps %ymm3, %ymm2, %ymm4
+; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; XOPAVX1-NEXT: vpslld $1, %xmm6, %xmm6
+; XOPAVX1-NEXT: vpshld %xmm5, %xmm6, %xmm5
+; XOPAVX1-NEXT: vpslld $1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpsubd %xmm3, %xmm4, %xmm5
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
-; XOPAVX1-NEXT: vpshld %xmm5, %xmm6, %xmm5
-; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm6
-; XOPAVX1-NEXT: vpshld %xmm6, %xmm1, %xmm6
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [32,32,32,32]
-; XOPAVX1-NEXT: vpsubd %xmm3, %xmm8, %xmm7
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; XOPAVX1-NEXT: vpshld %xmm7, %xmm6, %xmm6
-; XOPAVX1-NEXT: vpsubd %xmm2, %xmm8, %xmm7
-; XOPAVX1-NEXT: vpshld %xmm7, %xmm0, %xmm0
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; XOPAVX1-NEXT: vorps %ymm5, %ymm0, %ymm0
-; XOPAVX1-NEXT: vpcomeqd %xmm4, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpcomeqd %xmm4, %xmm2, %xmm2
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; XOPAVX1-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT: vpsubd %xmm3, %xmm4, %xmm3
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; XOPAVX1-NEXT: vpshld %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
+; XOPAVX1-NEXT: vpshld %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_funnnel_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
-; XOPAVX2-NEXT: vpsrlvd %ymm2, %ymm1, %ymm3
-; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
-; XOPAVX2-NEXT: vpsubd %ymm2, %ymm4, %ymm4
-; XOPAVX2-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2
-; XOPAVX2-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
+; XOPAVX2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
+; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT: vpslld $1, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%res = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
ret <8 x i32> %res
@@ -371,179 +322,162 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind {
; AVX1-LABEL: var_funnnel_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpsllw $12, %xmm3, %xmm4
-; AVX1-NEXT: vpsllw $4, %xmm3, %xmm5
-; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
-; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm7
-; AVX1-NEXT: vpblendvb %xmm4, %xmm7, %xmm6, %xmm4
-; AVX1-NEXT: vpsrlw $4, %xmm4, %xmm6
-; AVX1-NEXT: vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpsrlw $2, %xmm4, %xmm6
-; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm6
-; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpsllw $12, %xmm2, %xmm5
-; AVX1-NEXT: vpsllw $4, %xmm2, %xmm6
-; AVX1-NEXT: vpor %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vandps %ymm2, %ymm8, %ymm4
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
+; AVX1-NEXT: vpsllw $12, %xmm5, %xmm6
+; AVX1-NEXT: vpsllw $4, %xmm5, %xmm5
+; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm6
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm7
-; AVX1-NEXT: vpblendvb %xmm5, %xmm7, %xmm1, %xmm5
-; AVX1-NEXT: vpsrlw $4, %xmm5, %xmm7
-; AVX1-NEXT: vpblendvb %xmm6, %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpsrlw $2, %xmm5, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT: vpsrlw $8, %xmm7, %xmm3
+; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm7, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm3, %xmm5
+; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm5
; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vpblendvb %xmm6, %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpsrlw $1, %xmm5, %xmm7
+; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm5
; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vpblendvb %xmm6, %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm8
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; AVX1-NEXT: vpsubw %xmm3, %xmm5, %xmm6
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpslld $23, %xmm7, %xmm7
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; AVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7
-; AVX1-NEXT: vcvttps2dq %xmm7, %xmm7
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
-; AVX1-NEXT: vpslld $23, %xmm6, %xmm6
-; AVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm6
-; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6
-; AVX1-NEXT: vpackusdw %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT: vpmullw %xmm6, %xmm7, %xmm6
-; AVX1-NEXT: vpsubw %xmm2, %xmm5, %xmm5
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpslld $23, %xmm7, %xmm7
-; AVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7
-; AVX1-NEXT: vcvttps2dq %xmm7, %xmm7
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
-; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
-; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpsllw $12, %xmm4, %xmm5
+; AVX1-NEXT: vpsllw $4, %xmm4, %xmm4
+; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm5
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm6
+; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4
+; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm4
+; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm4
+; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vandnps %ymm8, %ymm2, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
+; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpsllw $1, %xmm4, %xmm4
+; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
+; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
-; AVX1-NEXT: vpackusdw %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; AVX1-NEXT: vorps %ymm0, %ymm8, %ymm0
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
+; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15]
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15]
-; AVX2-NEXT: vpsrlvd %ymm5, %ymm4, %ymm4
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm6
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm6[4],ymm3[4],ymm6[5],ymm3[5],ymm6[6],ymm3[6],ymm6[7],ymm3[7],ymm6[12],ymm3[12],ymm6[13],ymm3[13],ymm6[14],ymm3[14],ymm6[15],ymm3[15]
+; AVX2-NEXT: vpsrlvd %ymm7, %ymm4, %ymm4
; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
-; AVX2-NEXT: vpsrlvd %ymm6, %ymm5, %ymm5
-; AVX2-NEXT: vpsrld $16, %ymm5, %ymm5
-; AVX2-NEXT: vpackusdw %ymm4, %ymm5, %ymm4
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[8],ymm3[8],ymm6[9],ymm3[9],ymm6[10],ymm3[10],ymm6[11],ymm3[11]
+; AVX2-NEXT: vpsrlvd %ymm6, %ymm1, %ymm1
+; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT: vpackusdw %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpandn %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15]
+; AVX2-NEXT: vpsllw $1, %ymm0, %ymm0
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2-NEXT: vpsubw %ymm2, %ymm6, %ymm6
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm6[4],ymm3[4],ymm6[5],ymm3[5],ymm6[6],ymm3[6],ymm6[7],ymm3[7],ymm6[12],ymm3[12],ymm6[13],ymm3[13],ymm6[14],ymm3[14],ymm6[15],ymm3[15]
-; AVX2-NEXT: vpsllvd %ymm7, %ymm5, %ymm5
-; AVX2-NEXT: vpsrld $16, %ymm5, %ymm5
+; AVX2-NEXT: vpsllvd %ymm4, %ymm5, %ymm4
+; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[8],ymm3[8],ymm6[9],ymm3[9],ymm6[10],ymm3[10],ymm6[11],ymm3[11]
-; AVX2-NEXT: vpsllvd %ymm6, %ymm0, %ymm0
+; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
-; AVX2-NEXT: vpackusdw %ymm5, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpackusdw %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v16i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512F-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %ymm2, %ymm4, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
+; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT: vpsllw $1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v16i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %ymm2, %ymm4, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
+; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
-; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %ymm2, %ymm4, %ymm4
-; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
+; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
+; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v16i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm3
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT: vpsubw %ymm2, %ymm4, %ymm4
-; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
-; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
+; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v16i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VLBW-NEXT: vpsrlvw %ymm2, %ymm1, %ymm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %ymm2, %ymm4, %ymm4
-; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vptestnmw %ymm2, %ymm2, %k1
-; AVX512VLBW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
+; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpsllvw %ymm2, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v16i16:
@@ -554,51 +488,47 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
;
; XOPAVX1-LABEL: var_funnnel_v16i16:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; XOPAVX1-NEXT: vandnps %ymm3, %ymm2, %ymm4
+; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; XOPAVX1-NEXT: vpsllw $1, %xmm6, %xmm6
+; XOPAVX1-NEXT: vpshlw %xmm5, %xmm6, %xmm5
+; XOPAVX1-NEXT: vpsllw $1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshlw %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpsubw %xmm3, %xmm4, %xmm5
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
-; XOPAVX1-NEXT: vpshlw %xmm5, %xmm6, %xmm5
-; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm6
-; XOPAVX1-NEXT: vpshlw %xmm6, %xmm1, %xmm6
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16]
-; XOPAVX1-NEXT: vpsubw %xmm3, %xmm8, %xmm7
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; XOPAVX1-NEXT: vpshlw %xmm7, %xmm6, %xmm6
-; XOPAVX1-NEXT: vpsubw %xmm2, %xmm8, %xmm7
-; XOPAVX1-NEXT: vpshlw %xmm7, %xmm0, %xmm0
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; XOPAVX1-NEXT: vorps %ymm5, %ymm0, %ymm0
-; XOPAVX1-NEXT: vpcomeqw %xmm4, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpcomeqw %xmm4, %xmm2, %xmm2
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; XOPAVX1-NEXT: vpcmov %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX1-NEXT: vpsubw %xmm3, %xmm4, %xmm3
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; XOPAVX1-NEXT: vpshlw %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm2
+; XOPAVX1-NEXT: vpshlw %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_funnnel_v16i16:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
-; XOPAVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; XOPAVX2-NEXT: vpsubw %xmm3, %xmm4, %xmm3
-; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
-; XOPAVX2-NEXT: vpshlw %xmm3, %xmm5, %xmm3
-; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; XOPAVX2-NEXT: vpshlw %xmm4, %xmm1, %xmm4
-; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; XOPAVX2-NEXT: vpsubw %ymm2, %ymm4, %ymm4
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
; XOPAVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
+; XOPAVX2-NEXT: vpsllw $1, %ymm0, %ymm0
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm6
; XOPAVX2-NEXT: vpshlw %xmm5, %xmm6, %xmm5
; XOPAVX2-NEXT: vpshlw %xmm4, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
-; XOPAVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; XOPAVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT: vpsubw %xmm3, %xmm4, %xmm3
+; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
+; XOPAVX2-NEXT: vpshlw %xmm3, %xmm5, %xmm3
+; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm2
+; XOPAVX2-NEXT: vpshlw %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%res = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt)
ret <16 x i16> %res
@@ -607,297 +537,274 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) nounwind {
; AVX1-LABEL: var_funnnel_v32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpsllw $4, %xmm4, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm6
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX1-NEXT: vpsubb %xmm3, %xmm9, %xmm5
-; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
-; AVX1-NEXT: vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpsllw $2, %xmm4, %xmm6
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm6
-; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpsllw $4, %xmm0, %xmm5
-; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5
-; AVX1-NEXT: vpsubb %xmm2, %xmm9, %xmm6
-; AVX1-NEXT: vpsllw $5, %xmm6, %xmm6
-; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpsllw $2, %xmm0, %xmm5
-; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm5
-; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm8
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vpsrlw $4, %xmm4, %xmm5
+; AVX1-NEXT: vpsrlw $4, %xmm4, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm5, %xmm9, %xmm5
-; AVX1-NEXT: vpsllw $5, %xmm3, %xmm7
-; AVX1-NEXT: vpblendvb %xmm7, %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpsrlw $2, %xmm4, %xmm5
+; AVX1-NEXT: vpand %xmm3, %xmm9, %xmm6
+; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX1-NEXT: vandps %ymm2, %ymm8, %ymm7
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm3
+; AVX1-NEXT: vpsllw $5, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlw $2, %xmm4, %xmm6
; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX1-NEXT: vpand %xmm5, %xmm10, %xmm5
-; AVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7
-; AVX1-NEXT: vpblendvb %xmm7, %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7
-; AVX1-NEXT: vpblendvb %xmm7, %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5
-; AVX1-NEXT: vpand %xmm5, %xmm9, %xmm5
-; AVX1-NEXT: vpsllw $5, %xmm2, %xmm7
-; AVX1-NEXT: vpblendvb %xmm7, %xmm5, %xmm1, %xmm5
-; AVX1-NEXT: vpsrlw $2, %xmm5, %xmm0
-; AVX1-NEXT: vpand %xmm0, %xmm10, %xmm0
-; AVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7
-; AVX1-NEXT: vpblendvb %xmm7, %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm5
-; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm6
-; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: vorps %ymm0, %ymm8, %ymm0
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vpand %xmm6, %xmm10, %xmm6
+; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm6
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6
+; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4
+; AVX1-NEXT: vpand %xmm4, %xmm9, %xmm4
+; AVX1-NEXT: vpsllw $5, %xmm7, %xmm6
+; AVX1-NEXT: vpblendvb %xmm6, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm4
+; AVX1-NEXT: vpand %xmm4, %xmm10, %xmm4
+; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vpblendvb %xmm6, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm4
+; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm5
+; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpsllw $4, %xmm3, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandnps %ymm8, %ymm2, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
+; AVX1-NEXT: vpsllw $5, %xmm6, %xmm6
+; AVX1-NEXT: vpblendvb %xmm6, %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsllw $2, %xmm3, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4
+; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vpblendvb %xmm6, %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm4
+; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vpblendvb %xmm6, %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm4
+; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpsllw $5, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $2, %xmm0, %xmm4
+; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4
+; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm4
+; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v32i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vpsllw $5, %ymm2, %ymm3
-; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm4
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
-; AVX2-NEXT: vpblendvb %ymm3, %ymm4, %ymm1, %ymm4
-; AVX2-NEXT: vpsrlw $2, %ymm4, %ymm5
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm5, %ymm5
-; AVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX2-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
-; AVX2-NEXT: vpsrlw $1, %ymm4, %ymm5
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm5, %ymm5
-; AVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX2-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX2-NEXT: vpsubb %ymm2, %ymm4, %ymm4
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX2-NEXT: vpsllw $5, %ymm4, %ymm4
; AVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm5
-; AVX2-NEXT: vpsllw $4, %ymm0, %ymm6
+; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm6
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm6, %ymm6
-; AVX2-NEXT: vpblendvb %ymm4, %ymm6, %ymm0, %ymm0
-; AVX2-NEXT: vpsllw $2, %ymm0, %ymm4
+; AVX2-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm4
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm4
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
-; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm4
; AVX2-NEXT: vpaddb %ymm5, %ymm5, %ymm5
-; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpsllw $5, %ymm2, %ymm2
+; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm3
+; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpsllw $4, %ymm0, %ymm4
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsllw $2, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; AVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v32i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm3
-; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm4
-; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
-; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm1, %ymm4
-; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm5
-; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm5, %ymm5
-; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm5
-; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm5, %ymm5
-; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %ymm2, %ymm4, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm4
; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm5
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm6
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm6
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm6, %ymm6
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm0, %ymm0
-; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm4
+; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm4
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
-; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4
; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
-; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3
+; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
+; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2
+; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v32i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm3
-; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm4
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
-; AVX512VL-NEXT: vpblendvb %ymm3, %ymm4, %ymm1, %ymm4
-; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm5
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm5, %ymm5
-; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
-; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm5
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm5, %ymm5
-; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %ymm2, %ymm4, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4
; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm5
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm6
+; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm6
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm6, %ymm6
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm4
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm4
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
-; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
-; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
+; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v32i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
-; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %ymm2, %ymm4, %ymm4
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
+; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512BW-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v32i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
-; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VBMI2-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
-; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v32i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
-; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm4, %ymm4
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLBW-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VLBW-NEXT: vptestnmb %ymm2, %ymm2, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v32i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
-; AVX512VLVBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLVBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLVBMI2-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLVBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VLVBMI2-NEXT: vptestnmb %ymm2, %ymm2, %k1
-; AVX512VLVBMI2-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: var_funnnel_v32i8:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX1-NEXT: vandnps %ymm3, %ymm2, %ymm4
+; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; XOPAVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6
+; XOPAVX1-NEXT: vpshlb %xmm5, %xmm6, %xmm5
+; XOPAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshlb %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpsubb %xmm3, %xmm4, %xmm5
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
-; XOPAVX1-NEXT: vpshlb %xmm5, %xmm6, %xmm5
-; XOPAVX1-NEXT: vpsubb %xmm2, %xmm4, %xmm6
-; XOPAVX1-NEXT: vpshlb %xmm6, %xmm1, %xmm6
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; XOPAVX1-NEXT: vpsubb %xmm3, %xmm8, %xmm7
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; XOPAVX1-NEXT: vpshlb %xmm7, %xmm6, %xmm6
-; XOPAVX1-NEXT: vpsubb %xmm2, %xmm8, %xmm7
-; XOPAVX1-NEXT: vpshlb %xmm7, %xmm0, %xmm0
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; XOPAVX1-NEXT: vorps %ymm5, %ymm0, %ymm0
-; XOPAVX1-NEXT: vpcomeqb %xmm4, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpcomeqb %xmm4, %xmm2, %xmm2
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; XOPAVX1-NEXT: vpcmov %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX1-NEXT: vpsubb %xmm3, %xmm4, %xmm3
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; XOPAVX1-NEXT: vpshlb %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT: vpsubb %xmm2, %xmm4, %xmm2
+; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_funnnel_v32i8:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
-; XOPAVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; XOPAVX2-NEXT: vpsubb %xmm3, %xmm4, %xmm3
-; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
-; XOPAVX2-NEXT: vpshlb %xmm3, %xmm5, %xmm3
-; XOPAVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm4
-; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; XOPAVX2-NEXT: vpsubb %ymm2, %ymm4, %ymm4
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
; XOPAVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
+; XOPAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm6
; XOPAVX2-NEXT: vpshlb %xmm5, %xmm6, %xmm5
; XOPAVX2-NEXT: vpshlb %xmm4, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
-; XOPAVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; XOPAVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT: vpsubb %xmm3, %xmm4, %xmm3
+; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
+; XOPAVX2-NEXT: vpshlb %xmm3, %xmm5, %xmm3
+; XOPAVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm2
+; XOPAVX2-NEXT: vpshlb %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
ret <32 x i8> %res
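The checks above follow the expansion used when a variable-amount FSHR is not legal: the amount is masked with bitwidth-1 before the right shift of the second operand, the complement of that masked amount drives the left shift of the first operand after a pre-shift by one, and the two halves are ORed. Because the pre-shift absorbs the amount-equals-zero case, the compare-against-zero and blend/masked-move instructions in the removed lines are no longer needed. A rough scalar sketch for the 8-bit lanes tested here (fshr8 is a hypothetical helper name, not code from this commit):

#include <cstdint>

// Scalar model of the funnel-shift-right expansion exercised above.
static inline uint8_t fshr8(uint8_t x, uint8_t y, uint8_t amt) {
  unsigned a   = amt & 7;    // masked amount for the right shift of y (the vpand lines)
  unsigned inv = ~amt & 7;   // complemented amount for the left shift (the vpandn lines)
  // Pre-shifting x by one (vpaddb x,x / vpsllq $1) lets the left shift use inv
  // directly, so amt == 0 falls out as "return y" without a select.
  return (uint8_t)(((x << 1) << inv) | (y >> a));
}

The same shape appears, with wider lanes and the matching bit-width-minus-one masks, in the splatvar tests that follow.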
@@ -910,112 +817,98 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v4i64:
; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; AVX1-NEXT: vandnps {{.*}}(%rip), %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpsllq $1, %xmm5, %xmm5
+; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
-; AVX1-NEXT: vorps %ymm3, %ymm0, %ymm0
-; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; AVX1-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_funnnel_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
-; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsrlq %xmm2, %ymm1, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX2-NEXT: vpsllq %xmm4, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastq %xmm2, %ymm3
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpsrlq %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [63,63,63,63]
+; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsllq $1, %ymm0, %ymm0
+; AVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v4i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpsrlq %xmm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512F-NEXT: vpsllq %xmm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm3
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT: vpsrlq %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [63,63,63,63]
+; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsllq $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v4i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to2}, %xmm2, %xmm2
-; AVX512VL-NEXT: vpsrlq %xmm2, %ymm1, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512VL-NEXT: vpsllq %xmm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX512VL-NEXT: vptestnmq %ymm2, %ymm2, %k1
-; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastq %xmm2, %ymm3
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT: vpsrlq %xmm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsllq $1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpandnq {{.*}}(%rip){1to4}, %ymm3, %ymm2
+; AVX512VL-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v4i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrlq %xmm2, %ymm1, %ymm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT: vpsllq %xmm4, %ymm0, %ymm0
-; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512BW-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT: vpbroadcastq %xmm2, %ymm3
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlq %xmm2, %ymm1, %ymm1
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [63,63,63,63]
+; AVX512BW-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX512BW-NEXT: vpsllq $1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v4i64:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpsrlq %xmm2, %ymm1, %ymm3
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512VBMI2-NEXT: vpsllq %xmm4, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %ymm3
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsrlq %xmm2, %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [63,63,63,63]
+; AVX512VBMI2-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX512VBMI2-NEXT: vpsllq $1, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v4i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to2}, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpsrlq %xmm2, %ymm1, %ymm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpsllq %xmm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX512VLBW-NEXT: vptestnmq %ymm2, %ymm2, %k1
-; AVX512VLBW-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1}
+; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %ymm3
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsrlq %xmm2, %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpsllq $1, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpandnq {{.*}}(%rip){1to4}, %ymm3, %ymm2
+; AVX512VLBW-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i64:
@@ -1027,38 +920,34 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
;
; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm4
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; XOPAVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm5
-; XOPAVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
-; XOPAVX1-NEXT: vorps %ymm3, %ymm0, %ymm0
-; XOPAVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; XOPAVX1-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT: vpsrlq %xmm2, %xmm4, %xmm4
+; XOPAVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; XOPAVX1-NEXT: vandnps {{.*}}(%rip), %ymm3, %ymm2
+; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; XOPAVX1-NEXT: vpsllq $1, %xmm4, %xmm4
+; XOPAVX1-NEXT: vpshlq %xmm3, %xmm4, %xmm3
+; XOPAVX1-NEXT: vpsllq $1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshlq %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_funnnel_v4i64:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
-; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpsrlq %xmm2, %ymm1, %ymm3
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; XOPAVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; XOPAVX2-NEXT: vpsllq %xmm4, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpbroadcastq %xmm2, %ymm2
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpcmpeqq %ymm3, %ymm2, %ymm2
-; XOPAVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpbroadcastq %xmm2, %ymm3
+; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX2-NEXT: vpsrlq %xmm2, %ymm1, %ymm1
+; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [63,63,63,63]
+; XOPAVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; XOPAVX2-NEXT: vpsllq $1, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%splat = shufflevector <4 x i64> %amt, <4 x i64> undef, <4 x i32> zeroinitializer
%res = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %splat)
@@ -1068,126 +957,108 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32]
-; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX1-NEXT: vpsrld %xmm2, %xmm4, %xmm4
+; AVX1-NEXT: vpsrld %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vandnps {{.*}}(%rip), %ymm3, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpslld %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpslld %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
-; AVX1-NEXT: vorps %ymm3, %ymm0, %ymm0
-; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; AVX1-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vpslld $1, %xmm5, %xmm5
+; AVX1-NEXT: vpmulld %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
+; AVX1-NEXT: vpslld $1, %xmm0, %xmm0
+; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_funnnel_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX2-NEXT: vpsrld %xmm3, %ymm1, %ymm3
; AVX2-NEXT: vpbroadcastd %xmm2, %ymm2
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX2-NEXT: vpslld %xmm4, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX2-NEXT: vpslld $1, %ymm0, %ymm0
+; AVX2-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX2-NEXT: vpsrld %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v8i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX512F-NEXT: vpsrld %xmm3, %ymm1, %ymm3
; AVX512F-NEXT: vpbroadcastd %xmm2, %ymm2
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512F-NEXT: vpslld %xmm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX512F-NEXT: vpslld $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512F-NEXT: vpsrld %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v8i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX512VL-NEXT: vpsrld %xmm3, %ymm1, %ymm3
; AVX512VL-NEXT: vpbroadcastd %xmm2, %ymm2
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512VL-NEXT: vpslld %xmm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vptestnmd %ymm2, %ymm2, %k1
-; AVX512VL-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX512VL-NEXT: vpslld $1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512VL-NEXT: vpsrld %xmm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v8i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX512BW-NEXT: vpsrld %xmm3, %ymm1, %ymm3
; AVX512BW-NEXT: vpbroadcastd %xmm2, %ymm2
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512BW-NEXT: vpslld %xmm4, %ymm0, %ymm0
-; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX512BW-NEXT: vpslld $1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
+; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512BW-NEXT: vpsrld %xmm2, %ymm1, %ymm1
+; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v8i32:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX512VBMI2-NEXT: vpsrld %xmm3, %ymm1, %ymm3
; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %ymm2
-; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512VBMI2-NEXT: vpslld %xmm4, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
-; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512VBMI2-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX512VBMI2-NEXT: vpslld $1, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512VBMI2-NEXT: vpsrld %xmm2, %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v8i32:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX512VLBW-NEXT: vpsrld %xmm3, %ymm1, %ymm3
; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %ymm2
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512VLBW-NEXT: vpslld %xmm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vptestnmd %ymm2, %ymm2, %k1
-; AVX512VLBW-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1}
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX512VLBW-NEXT: vpslld $1, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512VLBW-NEXT: vpsrld %xmm2, %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i32:
@@ -1199,42 +1070,36 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
;
; XOPAVX1-LABEL: splatvar_funnnel_v8i32:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
-; XOPAVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm4
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32]
-; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; XOPAVX1-NEXT: vpslld %xmm4, %xmm5, %xmm5
-; XOPAVX1-NEXT: vpslld %xmm4, %xmm0, %xmm0
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
-; XOPAVX1-NEXT: vorps %ymm3, %ymm0, %ymm0
-; XOPAVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; XOPAVX1-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; XOPAVX1-NEXT: vpsrld %xmm2, %xmm4, %xmm4
+; XOPAVX1-NEXT: vpsrld %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; XOPAVX1-NEXT: vandnps {{.*}}(%rip), %ymm3, %ymm2
+; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; XOPAVX1-NEXT: vpslld $1, %xmm4, %xmm4
+; XOPAVX1-NEXT: vpshld %xmm3, %xmm4, %xmm3
+; XOPAVX1-NEXT: vpslld $1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshld %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_funnnel_v8i32:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; XOPAVX2-NEXT: vpsrld %xmm3, %ymm1, %ymm3
; XOPAVX2-NEXT: vpbroadcastd %xmm2, %ymm2
-; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; XOPAVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; XOPAVX2-NEXT: vpslld %xmm4, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2
-; XOPAVX2-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; XOPAVX2-NEXT: vpslld $1, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; XOPAVX2-NEXT: vpsrld %xmm2, %ymm1, %ymm1
+; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%splat = shufflevector <8 x i32> %amt, <8 x i32> undef, <8 x i32> zeroinitializer
%res = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %splat)
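In the AVX1 blocks above, which have no per-lane variable shift instruction, the variable left shift is done by multiplication: vpslld $23 places each shift amount in a float's exponent field, vpaddd with 1065353216 (the bit pattern of 1.0f) turns it into the float 2^amt, vcvttps2dq converts that back to an integer, and vpmulld multiplies, which is the same as shifting left. A per-lane sketch of that trick (make_pow2 and shl_via_mul are hypothetical names, not code from this commit):

#include <cstdint>
#include <cstring>

// Build 2^a by writing "a" into a float's exponent field, then shift by multiplying.
static inline uint32_t make_pow2(uint32_t a /* 0..31 */) {
  uint32_t bits = (a << 23) + 0x3F800000u;  // vpslld $23, then vpaddd with 1.0f's bit pattern
  float f;
  std::memcpy(&f, &bits, sizeof f);         // the lane now holds the float 2^a
  return (uint32_t)f;                       // vcvttps2dq: truncating convert back to 2^a
}

static inline uint32_t shl_via_mul(uint32_t x, uint32_t a) {
  return x * make_pow2(a);                  // vpmulld: equivalent to x << a
}

The v16i16 variant below does the same in 32-bit halves and re-packs with vpackusdw before a vpmullw.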
@@ -1244,125 +1109,130 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX1-NEXT: vpsrlw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; AVX1-NEXT: vandnps {{.*}}(%rip), %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
+; AVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
+; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpsllw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
-; AVX1-NEXT: vorps %ymm3, %ymm0, %ymm0
-; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsllw $1, %xmm5, %xmm5
+; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm3[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
+; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
+; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX1-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_funnnel_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX2-NEXT: vpsrlw %xmm3, %ymm1, %ymm3
; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX2-NEXT: vpsllw %xmm4, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsllw $1, %ymm0, %ymm0
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15]
+; AVX2-NEXT: vpandn {{.*}}(%rip), %ymm2, %ymm5
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15]
+; AVX2-NEXT: vpsllvd %ymm6, %ymm4, %ymm4
+; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11]
+; AVX2-NEXT: vpsllvd %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpackusdw %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX2-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v16i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm3, %ymm1, %ymm3
; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm3
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX512F-NEXT: vpsrlw %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpandn {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v16i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm3, %ymm1, %ymm3
; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm3
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX512VL-NEXT: vpsrlw %xmm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpandn {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v16i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512BW-NEXT: vpsrlw %xmm3, %ymm1, %ymm3
; AVX512BW-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512BW-NEXT: vpsllw %xmm4, %ymm0, %ymm0
-; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpandn {{.*}}(%rip), %ymm2, %ymm3
+; AVX512BW-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512BW-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
+; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v16i16:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VBMI2-NEXT: vpsrlw %xmm3, %ymm1, %ymm3
; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VBMI2-NEXT: vpsllw %xmm4, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
-; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpandn {{.*}}(%rip), %ymm2, %ymm3
+; AVX512VBMI2-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v16i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm1, %ymm3
; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VLBW-NEXT: vpsllw %xmm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vptestnmw %ymm2, %ymm2, %k1
-; AVX512VLBW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
+; AVX512VLBW-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpandn {{.*}}(%rip), %ymm2, %ymm3
+; AVX512VLBW-NEXT: vpsllvw %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VLBW-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i16:
@@ -1374,42 +1244,40 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
;
; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm4
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; XOPAVX1-NEXT: vpsllw %xmm4, %xmm5, %xmm5
-; XOPAVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm0
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
-; XOPAVX1-NEXT: vorps %ymm3, %ymm0, %ymm0
-; XOPAVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpcomeqw %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; XOPAVX1-NEXT: vpcmov %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; XOPAVX1-NEXT: vpsrlw %xmm2, %xmm4, %xmm4
+; XOPAVX1-NEXT: vpsrlw %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; XOPAVX1-NEXT: vandnps {{.*}}(%rip), %ymm3, %ymm2
+; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; XOPAVX1-NEXT: vpsllw $1, %xmm4, %xmm4
+; XOPAVX1-NEXT: vpshlw %xmm3, %xmm4, %xmm3
+; XOPAVX1-NEXT: vpsllw $1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshlw %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_funnnel_v16i16:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; XOPAVX2-NEXT: vpsrlw %xmm3, %ymm1, %ymm3
; XOPAVX2-NEXT: vpbroadcastw %xmm2, %ymm2
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; XOPAVX2-NEXT: vpsllw %xmm4, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
-; XOPAVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpsllw $1, %ymm0, %ymm0
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; XOPAVX2-NEXT: vpandn {{.*}}(%rip), %ymm2, %ymm4
+; XOPAVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
+; XOPAVX2-NEXT: vpshlw %xmm5, %xmm3, %xmm3
+; XOPAVX2-NEXT: vpshlw %xmm4, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; XOPAVX2-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
+; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%splat = shufflevector <16 x i16> %amt, <16 x i16> undef, <16 x i32> zeroinitializer
%res = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %splat)
@@ -1419,228 +1287,243 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v32i8:
; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; AVX1-NEXT: vandnps {{.*}}(%rip), %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpsllw $4, %xmm5, %xmm6
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm6
+; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpsllw $2, %xmm5, %xmm6
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm6
+; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm5, %xmm4
+; AVX1-NEXT: vpsllw $5, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm5
+; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5
+; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $2, %xmm0, %xmm5
+; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
+; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm5
+; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrlw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm6
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm4
-; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX1-NEXT: vpsubb %xmm2, %xmm6, %xmm6
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT: vpsllw %xmm6, %xmm7, %xmm7
-; AVX1-NEXT: vpsllw %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm7
-; AVX1-NEXT: vpsllw %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0
-; AVX1-NEXT: vorps %ymm3, %ymm0, %ymm0
-; AVX1-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlw %xmm2, %xmm4, %xmm4
+; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_funnnel_v32i8:
; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
+; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpsllw $4, %ymm0, %ymm3
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT: vpandn {{.*}}(%rip), %ymm2, %ymm4
+; AVX2-NEXT: vpsllw $5, %ymm4, %ymm4
+; AVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsllw $2, %ymm0, %ymm3
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm3
+; AVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpsrlw %xmm3, %ymm1, %ymm4
-; AVX2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX2-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
-; AVX2-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX2-NEXT: vpbroadcastb %xmm3, %ymm3
-; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vpsrlw %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpsllw %xmm4, %ymm0, %ymm0
-; AVX2-NEXT: vpsllw %xmm4, %xmm5, %xmm4
-; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4
-; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v32i8:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2
+; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
+; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
+; AVX512F-NEXT: vpandn {{.*}}(%rip), %ymm2, %ymm4
+; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm3
+; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3
+; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm3, %ymm1, %ymm4
-; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
-; AVX512F-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX512F-NEXT: vpbroadcastb %xmm3, %ymm3
-; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT: vpsrlw %xmm2, %xmm3, %xmm2
+; AVX512F-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpsllw %xmm4, %xmm5, %xmm4
-; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4
-; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v32i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %xmm3, %xmm4, %xmm4
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT: vpsllw %xmm4, %xmm5, %xmm4
-; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4
-; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm2, %ymm1, %ymm4
-; AVX512VL-NEXT: vpsrlw %xmm2, %xmm5, %xmm2
-; AVX512VL-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VL-NEXT: vpternlogq $236, %ymm4, %ymm0, %ymm2
-; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpeqb %ymm0, %ymm3, %ymm0
-; AVX512VL-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
+; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
+; AVX512VL-NEXT: vpandn {{.*}}(%rip), %ymm2, %ymm4
+; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm3
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm3
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm0
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpsrlw %xmm0, %ymm1, %ymm1
+; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT: vpsrlw %xmm0, %xmm2, %xmm0
+; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VL-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX512VL-NEXT: vpternlogq $236, %ymm1, %ymm3, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v32i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT: vpbroadcastb %xmm2, %ymm3
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpsrlw %xmm4, %zmm3, %zmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpsrlw %xmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandn {{.*}}(%rip), %ymm3, %ymm2
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %ymm3
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VBMI2-NEXT: vpsrlw %xmm4, %zmm3, %zmm3
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm2, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VBMI2-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpandn {{.*}}(%rip), %ymm3, %ymm2
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
-; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v32i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %ymm3
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm3, %zmm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpsrlw %xmm2, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpandn {{.*}}(%rip), %ymm3, %ymm2
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLBW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VLBW-NEXT: vptestnmb %ymm2, %ymm2, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %ymm3
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm3, %zmm3
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm2, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpandn {{.*}}(%rip), %ymm3, %ymm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLVBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VLVBMI2-NEXT: vptestnmb %ymm2, %ymm2, %k1
-; AVX512VLVBMI2-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_funnnel_v32i8:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm4
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; XOPAVX1-NEXT: vpshlb %xmm4, %xmm5, %xmm5
-; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm4
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; XOPAVX1-NEXT: vpsubb %xmm2, %xmm5, %xmm5
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; XOPAVX1-NEXT: vpshlb %xmm5, %xmm6, %xmm6
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX1-NEXT: vandnps %ymm4, %ymm2, %ymm5
+; XOPAVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
+; XOPAVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7
+; XOPAVX1-NEXT: vpshlb %xmm6, %xmm7, %xmm6
+; XOPAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vpshlb %xmm5, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; XOPAVX1-NEXT: vorps %ymm4, %ymm0, %ymm0
-; XOPAVX1-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; XOPAVX1-NEXT: vpcmov %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
+; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; XOPAVX1-NEXT: vpsubb %xmm4, %xmm3, %xmm4
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; XOPAVX1-NEXT: vpshlb %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_funnnel_v32i8:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX2-NEXT: vpbroadcastb %xmm2, %ymm2
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm3
-; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
-; XOPAVX2-NEXT: vpshlb %xmm3, %xmm4, %xmm4
-; XOPAVX2-NEXT: vpshlb %xmm3, %xmm1, %xmm3
-; XOPAVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; XOPAVX2-NEXT: vpsubb %ymm2, %ymm4, %ymm4
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
; XOPAVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
+; XOPAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm6
; XOPAVX2-NEXT: vpshlb %xmm5, %xmm6, %xmm5
; XOPAVX2-NEXT: vpshlb %xmm4, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
-; XOPAVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; XOPAVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT: vpsubb %xmm3, %xmm4, %xmm3
+; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
+; XOPAVX2-NEXT: vpshlb %xmm3, %xmm5, %xmm3
+; XOPAVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm2
+; XOPAVX2-NEXT: vpshlb %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%splat = shufflevector <32 x i8> %amt, <32 x i8> undef, <32 x i32> zeroinitializer
%res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %splat)
@@ -1844,86 +1727,78 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
; AVX1-LABEL: constant_funnnel_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,128,64,32,16,8,4,2]
-; AVX1-NEXT: vpmulhuw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <u,32768,16384,8192,4096,2048,1024,512>
-; AVX1-NEXT: vpmulhuw %xmm4, %xmm1, %xmm5
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpmullw %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vpsllw $1, %xmm0, %xmm2
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_funnnel_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
-; AVX2-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
-; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpmulhuw {{.*}}(%rip), %ymm1, %ymm2
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: constant_funnnel_v16i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
-; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: vpmulhuw {{.*}}(%rip), %ymm1, %ymm2
+; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v16i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
-; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
-; AVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512VL-NEXT: vpmulhuw {{.*}}(%rip), %ymm1, %ymm2
+; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15>
-; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <u,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1>
-; AVX512BW-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; AVX512BW-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: constant_funnnel_v16i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15>
-; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm2
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = <u,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1>
-; AVX512VBMI2-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; AVX512VBMI2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; AVX512VBMI2-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: constant_funnnel_v16i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm1, %ymm2
+; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpsllw $1, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm2, %ymm0, %ymm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm0 = [16,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512VLBW-NEXT: vpermi2w %ymm1, %ymm2, %ymm0
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v16i16:
@@ -1935,25 +1810,26 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin
; XOPAVX1-LABEL: constant_funnnel_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm1, %xmm2
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm3, %xmm3
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm3
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm1, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; XOPAVX1-NEXT: vpsllw $1, %xmm0, %xmm2
+; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT: vpsllw $1, %xmm0, %xmm0
; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; XOPAVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
-; XOPAVX1-NEXT: vpcmov {{.*}}(%rip), %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_funnnel_v16i16:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
-; XOPAVX2-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
-; XOPAVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; XOPAVX2-NEXT: vpmulhuw {{.*}}(%rip), %ymm1, %ymm2
+; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; XOPAVX2-NEXT: vpsllw $1, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%res = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
ret <16 x i16> %res
@@ -1976,42 +1852,41 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
; AVX1-NEXT: vpmullw %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpmullw %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <u,2,4,8,16,32,64,128>
-; AVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = <u,128,64,32,16,8,4,2>
-; AVX1-NEXT: vpmullw %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpmullw %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [128,1,2,4,8,16,32,64]
+; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [128,64,32,16,8,4,2,1]
+; AVX1-NEXT: vpmullw %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw %xmm7, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vpmullw %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_funnnel_v32i8:
; AVX2: # %bb.0:
+; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [49376,32928,16480,32,224,16416,32864,49312,49376,32928,16480,32,224,16416,32864,49312]
; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllw $2, %ymm0, %ymm2
@@ -2025,20 +1900,19 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
; AVX2-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX2-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: constant_funnnel_v32i8:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [49376,32928,16480,32,224,16416,32864,49312,49376,32928,16480,32,224,16416,32864,49312]
; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2
@@ -2052,20 +1926,19 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
-; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512F-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
+; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v32i8:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [49376,32928,16480,32,224,16416,32864,49312,49376,32928,16480,32,224,16416,32864,49312]
; AVX512VL-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
@@ -2079,96 +1952,88 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
-; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512VL-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpternlogq $228, {{.*}}(%rip){1to4}, %ymm1, %ymm0
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
+; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VL-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v32i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: constant_funnnel_v32i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VBMI2-NEXT: vpsrlvw {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VBMI2-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512VBMI2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: constant_funnnel_v32i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VLBW-NEXT: movl $16843009, %eax # imm = 0x1010101
-; AVX512VLBW-NEXT: kmovd %eax, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v32i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VLVBMI2-NEXT: movl $16843009, %eax # imm = 0x1010101
-; AVX512VLVBMI2-NEXT: kmovd %eax, %k1
-; AVX512VLVBMI2-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: constant_funnnel_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <u,255,254,253,252,251,250,249,u,249,250,251,252,253,254,255>
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,254,253,252,251,250,249,0,249,250,251,252,253,254,255]
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm3
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <u,7,6,5,4,3,2,1,u,1,2,3,4,5,6,7>
-; XOPAVX1-NEXT: vpshlb %xmm4, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpshlb %xmm4, %xmm0, %xmm0
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; XOPAVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
-; XOPAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
-; XOPAVX1-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; XOPAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,7,0,1,2,3,4,5,6]
+; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_funnnel_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,255,254,253,252,251,250,249,u,249,250,251,252,253,254,255>
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,254,253,252,251,250,249,0,249,250,251,252,253,254,255]
; XOPAVX2-NEXT: vpshlb %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpshlb %xmm3, %xmm1, %xmm3
-; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
-; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <u,7,6,5,4,3,2,1,u,1,2,3,4,5,6,7>
-; XOPAVX2-NEXT: vpshlb %xmm4, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpshlb %xmm4, %xmm0, %xmm0
-; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; XOPAVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vpshlb %xmm3, %xmm1, %xmm1
+; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; XOPAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,7,0,1,2,3,4,5,6]
+; XOPAVX2-NEXT: vpshlb %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpshlb %xmm3, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
ret <32 x i8> %res
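(Aside for readers skimming the check lines: the updated sequences mask the shift amount with bitwidth-1 (vpandq), pre-shift the other operand left by one (vpsllq $1 / vpslld $1 / vpsllw $1, or vpaddb for bytes), and feed the inverted mask (vpandnq) into the remaining left shift, so the old zero-amount blend (vptestnm* + masked vmovdqa) is no longer needed. Below is a minimal scalar sketch of that identity for one 64-bit lane, written as standalone C++ rather than LLVM code; fshr_ref and fshr_expanded are hypothetical helper names used only for this illustration.)

// Sketch of the funnel-shift-right expansion reflected in the checks above,
// assuming a 64-bit element; not part of the patch itself.
#include <cassert>
#include <cstdint>

// Reference semantics of llvm.fshr.i64: concatenate x:y and shift right.
static uint64_t fshr_ref(uint64_t x, uint64_t y, uint64_t amt) {
  uint64_t a = amt & 63;
  return a == 0 ? y : (x << (64 - a)) | (y >> a);
}

// Branch-free form matching the new codegen: srl by (amt & 63),
// shl of (x << 1) by (~amt & 63), then OR. When amt & 63 == 0 the
// left-shift half becomes zero, so no blend on a zero amount is needed.
static uint64_t fshr_expanded(uint64_t x, uint64_t y, uint64_t amt) {
  uint64_t srlAmt = amt & 63;   // corresponds to the vpandq with broadcast 63
  uint64_t shlAmt = ~amt & 63;  // corresponds to the vpandnq with broadcast 63
  return ((x << 1) << shlAmt) | (y >> srlAmt);
}

int main() {
  for (uint64_t amt = 0; amt < 128; ++amt)
    assert(fshr_expanded(0x0123456789abcdefULL, 0xfedcba9876543210ULL, amt) ==
           fshr_ref(0x0123456789abcdefULL, 0xfedcba9876543210ULL, amt));
  return 0;
}

(The i32, i16, and i8 tests in the following file apply the same trick with 31, 15, and 7 as the mask; for bytes the shift-by-one shows up as vpaddb of a register with itself.)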
diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll
index 9aa74f165bdd..0c001a8a8c0c 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll
@@ -18,38 +18,35 @@ declare <64 x i8> @llvm.fshr.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)
define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v8i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
-; AVX512F-NEXT: vpsrlvq %zmm2, %zmm1, %zmm3
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
-; AVX512F-NEXT: vpsubq %zmm2, %zmm4, %zmm4
-; AVX512F-NEXT: vpsllvq %zmm4, %zmm0, %zmm0
-; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0
-; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpandq %zmm3, %zmm2, %zmm4
+; AVX512F-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
+; AVX512F-NEXT: vpandnq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $1, %zmm0, %zmm0
+; AVX512F-NEXT: vpsllvq %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v8i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
-; AVX512VL-NEXT: vpsrlvq %zmm2, %zmm1, %zmm3
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
-; AVX512VL-NEXT: vpsubq %zmm2, %zmm4, %zmm4
-; AVX512VL-NEXT: vpsllvq %zmm4, %zmm0, %zmm0
-; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0
-; AVX512VL-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
+; AVX512VL-NEXT: vpandq %zmm3, %zmm2, %zmm4
+; AVX512VL-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
+; AVX512VL-NEXT: vpandnq %zmm3, %zmm2, %zmm2
+; AVX512VL-NEXT: vpsllq $1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpsllvq %zmm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsrlvq %zmm2, %zmm1, %zmm3
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
-; AVX512BW-NEXT: vpsubq %zmm2, %zmm4, %zmm4
-; AVX512BW-NEXT: vpsllvq %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
+; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
+; AVX512BW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
+; AVX512BW-NEXT: vpandnq %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsllq $1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllvq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v8i64:
@@ -60,14 +57,13 @@ define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt)
;
; AVX512VLBW-LABEL: var_funnnel_v8i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
-; AVX512VLBW-NEXT: vpsrlvq %zmm2, %zmm1, %zmm3
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
-; AVX512VLBW-NEXT: vpsubq %zmm2, %zmm4, %zmm4
-; AVX512VLBW-NEXT: vpsllvq %zmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
+; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
+; AVX512VLBW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpandnq %zmm3, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpsllq $1, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpsllvq %zmm2, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v8i64:
@@ -82,38 +78,35 @@ define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt)
define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v16i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512F-NEXT: vpsrlvd %zmm2, %zmm1, %zmm3
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512F-NEXT: vpsubd %zmm2, %zmm4, %zmm4
-; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0
-; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512F-NEXT: vpandd %zmm3, %zmm2, %zmm4
+; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
+; AVX512F-NEXT: vpandnd %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpslld $1, %zmm0, %zmm0
+; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v16i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm1, %zmm3
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512VL-NEXT: vpsubd %zmm2, %zmm4, %zmm4
-; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
-; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0
-; AVX512VL-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512VL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512VL-NEXT: vpandd %zmm3, %zmm2, %zmm4
+; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
+; AVX512VL-NEXT: vpandnd %zmm3, %zmm2, %zmm2
+; AVX512VL-NEXT: vpslld $1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsrlvd %zmm2, %zmm1, %zmm3
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT: vpsubd %zmm2, %zmm4, %zmm4
-; AVX512BW-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpord %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512BW-NEXT: vpandd %zmm3, %zmm2, %zmm4
+; AVX512BW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
+; AVX512BW-NEXT: vpandnd %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT: vpslld $1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v16i32:
@@ -124,14 +117,13 @@ define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %
;
; AVX512VLBW-LABEL: var_funnnel_v16i32:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512VLBW-NEXT: vpsrlvd %zmm2, %zmm1, %zmm3
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %zmm2, %zmm4, %zmm4
-; AVX512VLBW-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpord %zmm3, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512VLBW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512VLBW-NEXT: vpandd %zmm3, %zmm2, %zmm4
+; AVX512VLBW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpandnd %zmm3, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpslld $1, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v16i32:
@@ -146,84 +138,77 @@ define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %
define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512F-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
-; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpandq %zmm3, %zmm2, %zmm4
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpsrlvd %zmm5, %zmm6, %zmm5
+; AVX512F-NEXT: vpmovdw %zmm5, %ymm5
+; AVX512F-NEXT: vextracti64x4 $1, %zmm4, %ymm4
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm5
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
-; AVX512F-NEXT: vpsrlvd %zmm6, %zmm4, %zmm4
-; AVX512F-NEXT: vpmovdw %zmm4, %ymm4
-; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %ymm2, %ymm4, %ymm6
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpsllvd %zmm6, %zmm7, %zmm6
-; AVX512F-NEXT: vpmovdw %zmm6, %ymm6
-; AVX512F-NEXT: vpsubw %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
+; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1
+; AVX512F-NEXT: vpandnq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT: vpsllw $1, %ymm0, %ymm4
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512F-NEXT: vpsllvd %zmm3, %zmm4, %zmm3
+; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vpsllw $1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
+; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0
-; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm3
-; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vpcmpeqw %ymm0, %ymm5, %ymm4
-; AVX512F-NEXT: vpcmpeqw %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512F-NEXT: vpternlogq $202, %zmm3, %zmm1, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
-; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpandq %zmm3, %zmm2, %zmm4
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vpsrlvd %zmm5, %zmm6, %zmm5
+; AVX512VL-NEXT: vpmovdw %zmm5, %ymm5
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm4, %ymm4
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm5
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
-; AVX512VL-NEXT: vpsrlvd %zmm6, %zmm4, %zmm4
-; AVX512VL-NEXT: vpmovdw %zmm4, %ymm4
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %ymm2, %ymm4, %ymm6
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpsllvd %zmm6, %zmm7, %zmm6
-; AVX512VL-NEXT: vpmovdw %zmm6, %ymm6
-; AVX512VL-NEXT: vpsubw %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
+; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1
+; AVX512VL-NEXT: vpandnq %zmm3, %zmm2, %zmm2
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm4
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512VL-NEXT: vpsllvd %zmm3, %zmm4, %zmm3
+; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
+; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0
-; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm3
-; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpeqw %ymm0, %ymm5, %ymm4
-; AVX512VL-NEXT: vpcmpeqw %ymm0, %ymm2, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512VL-NEXT: vpternlogq $202, %zmm3, %zmm1, %zmm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
-; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm3
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %zmm2, %zmm4, %zmm4
-; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
+; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
+; AVX512BW-NEXT: vpandnq %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsllw $1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v32i16:
@@ -234,14 +219,13 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %
;
; AVX512VLBW-LABEL: var_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
-; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm3
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %zmm2, %zmm4, %zmm4
-; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vptestnmw %zmm2, %zmm2, %k1
-; AVX512VLBW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
+; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpandnq %zmm3, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpsllw $1, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v32i16:
@@ -256,294 +240,284 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %
define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
-; AVX512F-NEXT: vpsllw $4, %ymm4, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm6
-; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %ymm3, %ymm7, %ymm8
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpandq %zmm6, %zmm2, %zmm7
+; AVX512F-NEXT: vextracti64x4 $1, %zmm7, %ymm8
; AVX512F-NEXT: vpsllw $5, %ymm8, %ymm8
-; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm6
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512F-NEXT: vpand %ymm6, %ymm9, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm4, %ymm9, %ymm4
; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm8
-; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm4, %ymm10, %ymm4
; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm8
-; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm6
-; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm5
-; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm4
+; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpsllw $5, %ymm7, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm4
+; AVX512F-NEXT: vpand %ymm4, %ymm9, %ymm4
+; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm4
+; AVX512F-NEXT: vpand %ymm4, %ymm10, %ymm4
+; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpandnq %zmm6, %zmm2, %zmm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm6
; AVX512F-NEXT: vpsllw $5, %ymm6, %ymm6
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm5
-; AVX512F-NEXT: vpand %ymm5, %ymm9, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4
; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-NEXT: vpsrlw $4, %ymm4, %ymm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
-; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm7
-; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5
-; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm7
-; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpand %ymm5, %ymm9, %ymm5
-; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm7
-; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm5
-; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
-; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm6
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm5
-; AVX512F-NEXT: vpsrlw $2, %ymm5, %ymm7
-; AVX512F-NEXT: vpand %ymm7, %ymm8, %ymm7
-; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
-; AVX512F-NEXT: vpsrlw $1, %ymm5, %ymm7
-; AVX512F-NEXT: vpand %ymm7, %ymm9, %ymm7
-; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
-; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT: vporq %zmm4, %zmm0, %zmm4
-; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vpcmpeqb %ymm0, %ymm3, %ymm3
-; AVX512F-NEXT: vpcmpeqb %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
+; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4
+; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
+; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4
+; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512F-NEXT: vpternlogq $202, %zmm4, %zmm1, %zmm0
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
-; AVX512VL-NEXT: vpsllw $4, %ymm4, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm6
-; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %ymm3, %ymm7, %ymm8
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpandq %zmm6, %zmm2, %zmm7
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm7, %ymm8
; AVX512VL-NEXT: vpsllw $5, %ymm8, %ymm8
-; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm6
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsrlw $2, %ymm3, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512VL-NEXT: vpand %ymm4, %ymm9, %ymm4
; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm8
-; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsrlw $1, %ymm3, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-NEXT: vpand %ymm4, %ymm10, %ymm4
; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm8
-; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm6
-; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm5
-; AVX512VL-NEXT: vpsubb %ymm2, %ymm7, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm4
+; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsllw $5, %ymm7, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm4
+; AVX512VL-NEXT: vpand %ymm4, %ymm9, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm4
+; AVX512VL-NEXT: vpand %ymm4, %ymm10, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpandnq %zmm6, %zmm2, %zmm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm6
; AVX512VL-NEXT: vpsllw $5, %ymm6, %ymm6
-; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm5
-; AVX512VL-NEXT: vpand %ymm5, %ymm9, %ymm5
-; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
-; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm5
-; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
-; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512VL-NEXT: vpsrlw $4, %ymm4, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
-; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm7
-; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
-; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX512VL-NEXT: vpand %ymm5, %ymm8, %ymm5
-; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7
-; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
-; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-NEXT: vpand %ymm5, %ymm9, %ymm5
-; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7
-; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
-; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm5
-; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
-; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm6
-; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm5
-; AVX512VL-NEXT: vpsrlw $2, %ymm5, %ymm7
-; AVX512VL-NEXT: vpand %ymm7, %ymm8, %ymm7
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
-; AVX512VL-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
-; AVX512VL-NEXT: vpsrlw $1, %ymm5, %ymm7
-; AVX512VL-NEXT: vpand %ymm7, %ymm9, %ymm7
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4
; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
-; AVX512VL-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-NEXT: vporq %zmm4, %zmm0, %zmm4
-; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpeqb %ymm0, %ymm3, %ymm3
-; AVX512VL-NEXT: vpcmpeqb %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512VL-NEXT: vpternlogq $202, %zmm4, %zmm1, %zmm0
+; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm3
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
-; AVX512BW-NEXT: vpsllw $5, %zmm2, %zmm4
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vpblendmb %zmm3, %zmm1, %zmm3 {%k1}
-; AVX512BW-NEXT: vpsrlw $2, %zmm3, %zmm5
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
-; AVX512BW-NEXT: vpsrlw $1, %zmm3, %zmm5
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %zmm2, %zmm4, %zmm4
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
; AVX512BW-NEXT: vpsllw $5, %zmm4, %zmm4
; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm5
; AVX512BW-NEXT: vpmovb2m %zmm5, %k1
; AVX512BW-NEXT: vpmovb2m %zmm4, %k2
-; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm4
+; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm4
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
-; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm0 {%k2}
-; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm4
+; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm1 {%k2}
+; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm4
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
-; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm0 {%k1}
-; AVX512BW-NEXT: vpaddb %zmm5, %zmm5, %zmm4
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm1 {%k1}
+; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm4
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
+; AVX512BW-NEXT: vpaddb %zmm5, %zmm5, %zmm5
+; AVX512BW-NEXT: vpmovb2m %zmm5, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm1 {%k1}
+; AVX512BW-NEXT: vpandnq %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsllw $5, %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm3
+; AVX512BW-NEXT: vpmovb2m %zmm3, %k1
+; AVX512BW-NEXT: vpmovb2m %zmm2, %k2
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k2}
+; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm2
+; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
-; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm3
-; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
-; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
-; AVX512VBMI2-NEXT: vpsllw $5, %zmm2, %zmm4
-; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VBMI2-NEXT: vpblendmb %zmm3, %zmm1, %zmm3 {%k1}
-; AVX512VBMI2-NEXT: vpsrlw $2, %zmm3, %zmm5
-; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
-; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
-; AVX512VBMI2-NEXT: vpsrlw $1, %zmm3, %zmm5
-; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
-; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
-; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %zmm2, %zmm4, %zmm4
+; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4
; AVX512VBMI2-NEXT: vpsllw $5, %zmm4, %zmm4
; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm5
; AVX512VBMI2-NEXT: vpmovb2m %zmm5, %k1
; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k2
-; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm4
+; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm4
; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm4, %zmm0 {%k2}
-; AVX512VBMI2-NEXT: vpsllw $2, %zmm0, %zmm4
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm4, %zmm1 {%k2}
+; AVX512VBMI2-NEXT: vpsrlw $2, %zmm1, %zmm4
; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm4, %zmm0 {%k1}
-; AVX512VBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm4
-; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm4, %zmm1 {%k1}
+; AVX512VBMI2-NEXT: vpsrlw $1, %zmm1, %zmm4
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
+; AVX512VBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm5
+; AVX512VBMI2-NEXT: vpmovb2m %zmm5, %k1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm4, %zmm1 {%k1}
+; AVX512VBMI2-NEXT: vpandnq %zmm3, %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vpsllw $5, %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm3
+; AVX512VBMI2-NEXT: vpmovb2m %zmm3, %k1
+; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k2
+; AVX512VBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm2
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm2, %zmm0 {%k2}
+; AVX512VBMI2-NEXT: vpsllw $2, %zmm0, %zmm2
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm2
+; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
; AVX512VBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
-; AVX512VBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm3
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
-; AVX512VLBW-NEXT: vpsllw $5, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm1, %zmm3 {%k1}
-; AVX512VLBW-NEXT: vpsrlw $2, %zmm3, %zmm5
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
-; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
-; AVX512VLBW-NEXT: vpsrlw $1, %zmm3, %zmm5
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
-; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %zmm2, %zmm4, %zmm4
+; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
; AVX512VLBW-NEXT: vpsllw $5, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm5
; AVX512VLBW-NEXT: vpmovb2m %zmm5, %k1
; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k2
-; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm4
+; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm4
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
-; AVX512VLBW-NEXT: vmovdqu8 %zmm4, %zmm0 {%k2}
-; AVX512VLBW-NEXT: vpsllw $2, %zmm0, %zmm4
+; AVX512VLBW-NEXT: vmovdqu8 %zmm4, %zmm1 {%k2}
+; AVX512VLBW-NEXT: vpsrlw $2, %zmm1, %zmm4
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
-; AVX512VLBW-NEXT: vmovdqu8 %zmm4, %zmm0 {%k1}
-; AVX512VLBW-NEXT: vpaddb %zmm5, %zmm5, %zmm4
-; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT: vmovdqu8 %zmm4, %zmm1 {%k1}
+; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
+; AVX512VLBW-NEXT: vpaddb %zmm5, %zmm5, %zmm5
+; AVX512VLBW-NEXT: vpmovb2m %zmm5, %k1
+; AVX512VLBW-NEXT: vmovdqu8 %zmm4, %zmm1 {%k1}
+; AVX512VLBW-NEXT: vpandnq %zmm3, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpsllw $5, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm3
+; AVX512VLBW-NEXT: vpmovb2m %zmm3, %k1
+; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k2
+; AVX512VLBW-NEXT: vpaddb %zmm0, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k2}
+; AVX512VLBW-NEXT: vpsllw $2, %zmm0, %zmm2
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm2
+; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
; AVX512VLBW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
-; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm3
-; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
-; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
-; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm2, %zmm4
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VLVBMI2-NEXT: vpblendmb %zmm3, %zmm1, %zmm3 {%k1}
-; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm3, %zmm5
-; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
-; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm3, %zmm5
-; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
-; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %zmm2, %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4
; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm4, %zmm4
; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm5
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm5, %k1
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k2
-; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm4
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm4
; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm4, %zmm0 {%k2}
-; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm0, %zmm4
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm4, %zmm1 {%k2}
+; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm1, %zmm4
; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm4, %zmm0 {%k1}
-; AVX512VLVBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm4
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm4, %zmm1 {%k1}
+; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm1, %zmm4
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm5
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm5, %k1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm4, %zmm1 {%k1}
+; AVX512VLVBMI2-NEXT: vpandnq %zmm3, %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm3
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm3, %k1
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k2
+; AVX512VLVBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm2
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm2, %zmm0 {%k2}
+; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm0, %zmm2
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm2
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
; AVX512VLVBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
-; AVX512VLVBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: retq
%res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
ret <64 x i8> %res
@@ -556,43 +530,35 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v8i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpsrlq %xmm2, %zmm1, %zmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512F-NEXT: vpsllq %xmm4, %zmm0, %zmm0
-; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0
-; AVX512F-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: vpbroadcastq %xmm2, %zmm3
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT: vpsrlq %xmm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpsllq $1, %zmm0, %zmm0
+; AVX512F-NEXT: vpandnq {{.*}}(%rip){1to8}, %zmm3, %zmm2
+; AVX512F-NEXT: vpsllvq %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v8i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to2}, %xmm2, %xmm2
-; AVX512VL-NEXT: vpsrlq %xmm2, %zmm1, %zmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512VL-NEXT: vpsllq %xmm4, %zmm0, %zmm0
-; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0
-; AVX512VL-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512VL-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastq %xmm2, %zmm3
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT: vpsrlq %xmm2, %zmm1, %zmm1
+; AVX512VL-NEXT: vpsllq $1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpandnq {{.*}}(%rip){1to8}, %zmm3, %zmm2
+; AVX512VL-NEXT: vpsllvq %zmm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrlq %xmm2, %zmm1, %zmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT: vpsllq %xmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: vpbroadcastq %xmm2, %zmm3
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlq %xmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsllq $1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandnq {{.*}}(%rip){1to8}, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsllvq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v8i64:
@@ -604,15 +570,13 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %
;
; AVX512VLBW-LABEL: splatvar_funnnel_v8i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to2}, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpsrlq %xmm2, %zmm1, %zmm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
-; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpsllq %xmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512VLBW-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %zmm3
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsrlq %xmm2, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpsllq $1, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpandnq {{.*}}(%rip){1to8}, %zmm3, %zmm2
+; AVX512VLBW-NEXT: vpsllvq %zmm2, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i64:
@@ -629,49 +593,41 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %
define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v16i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX512F-NEXT: vpsrld %xmm3, %zmm1, %zmm3
; AVX512F-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512F-NEXT: vpslld %xmm4, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0
-; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512F-NEXT: vpandnd %zmm3, %zmm2, %zmm4
+; AVX512F-NEXT: vpslld $1, %zmm0, %zmm0
+; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
+; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512F-NEXT: vpsrld %xmm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v16i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX512VL-NEXT: vpsrld %xmm3, %zmm1, %zmm3
; AVX512VL-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512VL-NEXT: vpslld %xmm4, %zmm0, %zmm0
-; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0
-; AVX512VL-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512VL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512VL-NEXT: vpandnd %zmm3, %zmm2, %zmm4
+; AVX512VL-NEXT: vpslld $1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
+; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512VL-NEXT: vpsrld %xmm2, %zmm1, %zmm1
+; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX512BW-NEXT: vpsrld %xmm3, %zmm1, %zmm3
; AVX512BW-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512BW-NEXT: vpslld %xmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpord %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512BW-NEXT: vpandnd %zmm3, %zmm2, %zmm4
+; AVX512BW-NEXT: vpslld $1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512BW-NEXT: vpsrld %xmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v16i32:
@@ -683,17 +639,15 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
;
; AVX512VLBW-LABEL: splatvar_funnnel_v16i32:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX512VLBW-NEXT: vpsrld %xmm3, %zmm1, %zmm3
; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512VLBW-NEXT: vpslld %xmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpord %zmm3, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; AVX512VLBW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512VLBW-NEXT: vpandnd %zmm3, %zmm2, %zmm4
+; AVX512VLBW-NEXT: vpslld $1, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512VLBW-NEXT: vpsrld %xmm2, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i32:
@@ -710,63 +664,68 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm4
-; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
-; AVX512F-NEXT: vpsllw %xmm4, %ymm5, %ymm5
-; AVX512F-NEXT: vpsllw %xmm4, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
-; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm3
-; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vpcmpeqw %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512F-NEXT: vpternlogq $202, %zmm3, %zmm1, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm3
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-NEXT: vpsrlw %xmm3, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
+; AVX512F-NEXT: vpandnq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT: vpsllw $1, %ymm0, %ymm4
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512F-NEXT: vpsllvd %zmm3, %zmm4, %zmm3
+; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm4, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm4
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm5
-; AVX512VL-NEXT: vpsllw %xmm4, %ymm5, %ymm5
-; AVX512VL-NEXT: vpsllw %xmm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
-; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm3
-; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpeqw %ymm0, %ymm2, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512VL-NEXT: vpternlogq $202, %zmm3, %zmm1, %zmm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm3
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw %xmm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
+; AVX512VL-NEXT: vpandnq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm4
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512VL-NEXT: vpsllvd %zmm3, %zmm4, %zmm3
+; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512BW-NEXT: vpsrlw %xmm3, %zmm1, %zmm3
; AVX512BW-NEXT: vpbroadcastw %xmm2, %zmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512BW-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
-; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsllw $1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandnq {{.*}}(%rip), %zmm2, %zmm3
+; AVX512BW-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512BW-NEXT: vpsrlw %xmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v32i16:
@@ -778,17 +737,14 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
;
; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm1, %zmm3
; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %zmm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vptestnmw %zmm2, %zmm2, %k1
-; AVX512VLBW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512VLBW-NEXT: vpsllw $1, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpandnq {{.*}}(%rip), %zmm2, %zmm3
+; AVX512VLBW-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VLBW-NEXT: vpsrlw %xmm2, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i16:
@@ -805,154 +761,210 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %xmm3, %xmm4, %xmm4
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
-; AVX512F-NEXT: vpsllw %xmm4, %ymm5, %ymm5
-; AVX512F-NEXT: vpsllw %xmm4, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
-; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpsllw %xmm4, %xmm5, %xmm4
-; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4
-; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4
-; AVX512F-NEXT: vpandq %zmm4, %zmm0, %zmm0
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm2, %ymm4, %ymm4
-; AVX512F-NEXT: vpsrlw %xmm2, %ymm1, %ymm6
-; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4
-; AVX512F-NEXT: vpsrlw %xmm2, %xmm5, %xmm2
-; AVX512F-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
-; AVX512F-NEXT: vpternlogq $236, %zmm4, %zmm0, %zmm2
-; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vpcmpeqb %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpandnq {{.*}}(%rip), %zmm2, %zmm6
+; AVX512F-NEXT: vextracti64x4 $1, %zmm6, %ymm7
+; AVX512F-NEXT: vpsllw $5, %ymm7, %ymm7
+; AVX512F-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT: vpand %ymm4, %ymm8, %ymm4
+; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm7
+; AVX512F-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4
+; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm7
+; AVX512F-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
+; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpsllw $5, %ymm6, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4
+; AVX512F-NEXT: vpand %ymm4, %ymm8, %ymm4
+; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4
+; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm0
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vpsrlw %xmm0, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpsrlw %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512F-NEXT: vpternlogq $202, %zmm2, %zmm1, %zmm0
+; AVX512F-NEXT: vpternlogq $236, %zmm1, %zmm3, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %xmm3, %xmm4, %xmm4
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm5
-; AVX512VL-NEXT: vpsllw %xmm4, %ymm5, %ymm5
-; AVX512VL-NEXT: vpsllw %xmm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
-; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT: vpsllw %xmm4, %xmm5, %xmm4
-; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4
-; AVX512VL-NEXT: vpandq %zmm4, %zmm0, %zmm0
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm2, %ymm4, %ymm4
-; AVX512VL-NEXT: vpsrlw %xmm2, %ymm1, %ymm6
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4
-; AVX512VL-NEXT: vpsrlw %xmm2, %xmm5, %xmm2
-; AVX512VL-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
-; AVX512VL-NEXT: vpternlogq $236, %zmm4, %zmm0, %zmm2
-; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpeqb %ymm0, %ymm3, %ymm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpandnq {{.*}}(%rip), %zmm2, %zmm6
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm6, %ymm7
+; AVX512VL-NEXT: vpsllw $5, %ymm7, %ymm7
+; AVX512VL-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT: vpand %ymm4, %ymm8, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7
+; AVX512VL-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7
+; AVX512VL-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsllw $5, %ymm6, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm4, %ymm8, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm0
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512VL-NEXT: vpsrlw %xmm0, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw %xmm0, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT: vpsrlw %xmm0, %xmm2, %xmm0
+; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VL-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512VL-NEXT: vpternlogq $202, %zmm2, %zmm1, %zmm0
+; AVX512VL-NEXT: vpternlogq $236, %zmm1, %zmm3, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %xmm3, %xmm4, %xmm4
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512BW-NEXT: vpsllw %xmm4, %xmm5, %xmm4
-; AVX512BW-NEXT: vpbroadcastb %xmm4, %zmm4
-; AVX512BW-NEXT: vpandq %zmm4, %zmm0, %zmm4
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpsrlw %xmm0, %zmm1, %zmm2
-; AVX512BW-NEXT: vpsrlw %xmm0, %xmm5, %xmm0
+; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm3
+; AVX512BW-NEXT: vpsllw $4, %zmm3, %zmm0
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vpandnq {{.*}}(%rip), %zmm2, %zmm4
+; AVX512BW-NEXT: vpsllw $5, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1}
+; AVX512BW-NEXT: vpsllw $2, %zmm3, %zmm0
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1}
+; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm0
+; AVX512BW-NEXT: vpmovb2m %zmm0, %k1
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm0
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpsrlw %xmm0, %zmm1, %zmm1
+; AVX512BW-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlw %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0
-; AVX512BW-NEXT: vpternlogq $236, %zmm2, %zmm4, %zmm0
-; AVX512BW-NEXT: vptestnmb %zmm3, %zmm3, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: vpternlogq $236, %zmm1, %zmm3, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm3
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %xmm3, %xmm4, %xmm4
-; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VBMI2-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512VBMI2-NEXT: vpsllw %xmm4, %xmm5, %xmm4
-; AVX512VBMI2-NEXT: vpbroadcastb %xmm4, %zmm4
-; AVX512VBMI2-NEXT: vpandq %zmm4, %zmm0, %zmm4
-; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VBMI2-NEXT: vpsrlw %xmm0, %zmm1, %zmm2
-; AVX512VBMI2-NEXT: vpsrlw %xmm0, %xmm5, %xmm0
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
+; AVX512VBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm3
+; AVX512VBMI2-NEXT: vpsllw $4, %zmm3, %zmm0
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpandnq {{.*}}(%rip), %zmm2, %zmm4
+; AVX512VBMI2-NEXT: vpsllw $5, %zmm4, %zmm4
+; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1}
+; AVX512VBMI2-NEXT: vpsllw $2, %zmm3, %zmm0
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1}
+; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm0
+; AVX512VBMI2-NEXT: vpmovb2m %zmm0, %k1
+; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm0
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm0, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsrlw %xmm0, %xmm2, %xmm0
; AVX512VBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vpbroadcastb %xmm0, %zmm0
-; AVX512VBMI2-NEXT: vpternlogq $236, %zmm2, %zmm4, %zmm0
-; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm3, %k1
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VBMI2-NEXT: vpternlogq $236, %zmm1, %zmm3, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %xmm3, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512VLBW-NEXT: vpsllw %xmm4, %xmm5, %xmm4
-; AVX512VLBW-NEXT: vpbroadcastb %xmm4, %zmm4
-; AVX512VLBW-NEXT: vpandq %zmm4, %zmm0, %zmm4
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpsrlw %xmm0, %zmm1, %zmm2
-; AVX512VLBW-NEXT: vpsrlw %xmm0, %xmm5, %xmm0
+; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2
+; AVX512VLBW-NEXT: vpaddb %zmm0, %zmm0, %zmm3
+; AVX512VLBW-NEXT: vpsllw $4, %zmm3, %zmm0
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpandnq {{.*}}(%rip), %zmm2, %zmm4
+; AVX512VLBW-NEXT: vpsllw $5, %zmm4, %zmm4
+; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1}
+; AVX512VLBW-NEXT: vpsllw $2, %zmm3, %zmm0
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1}
+; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm0
+; AVX512VLBW-NEXT: vpmovb2m %zmm0, %k1
+; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm0
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpsrlw %xmm0, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsrlw %xmm0, %xmm2, %xmm0
; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %zmm0
-; AVX512VLBW-NEXT: vpternlogq $236, %zmm2, %zmm4, %zmm0
-; AVX512VLBW-NEXT: vptestnmb %zmm3, %zmm3, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VLBW-NEXT: vpternlogq $236, %zmm1, %zmm3, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm3
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %xmm3, %xmm4, %xmm4
-; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %xmm5, %xmm4
-; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm4, %zmm4
-; AVX512VLVBMI2-NEXT: vpandq %zmm4, %zmm0, %zmm4
-; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm0, %zmm1, %zmm2
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm0, %xmm5, %xmm0
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm3
+; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm3, %zmm0
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpandnq {{.*}}(%rip), %zmm2, %zmm4
+; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm3, %zmm0
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm0
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm0, %k1
+; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm0
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm0, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm0, %xmm2, %xmm0
; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm0, %zmm0
-; AVX512VLVBMI2-NEXT: vpternlogq $236, %zmm2, %zmm4, %zmm0
-; AVX512VLVBMI2-NEXT: vptestnmb %zmm3, %zmm3, %k1
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VLVBMI2-NEXT: vpternlogq $236, %zmm1, %zmm3, %zmm0
; AVX512VLVBMI2-NEXT: retq
%splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
%res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %splat)
@@ -1056,44 +1068,50 @@ define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwin
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
-; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpmulhuw %ymm3, %ymm1, %ymm4
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
-; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4
+; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm4
+; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3,4,5,6,7]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-NEXT: vpmulhuw %ymm3, %ymm1, %ymm3
+; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3,4,5,6,7]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpsllw $1, %ymm2, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2,1]
+; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllw $1, %ymm0, %ymm0
; AVX512F-NEXT: vpmullw %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm2
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpternlogq $202, %zmm1, %zmm2, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
-; AVX512VL-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpmulhuw %ymm3, %ymm1, %ymm4
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
-; AVX512VL-NEXT: vpmullw %ymm3, %ymm4, %ymm4
+; AVX512VL-NEXT: vpmulhuw %ymm3, %ymm2, %ymm4
+; AVX512VL-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512VL-NEXT: vpmulhuw %ymm3, %ymm1, %ymm3
+; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpsllw $1, %ymm2, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2,1]
+; AVX512VL-NEXT: vpmullw %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm0
; AVX512VL-NEXT: vpmullw %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm2
-; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512VL-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512VL-NEXT: vpternlogq $202, %zmm1, %zmm2, %zmm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm2
+; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vpsllw $1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: movl $65537, %eax # imm = 0x10001
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: constant_funnnel_v32i16:
@@ -1104,12 +1122,10 @@ define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwin
;
; AVX512VLBW-LABEL: constant_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm2
+; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpsllw $1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm2, %zmm0, %zmm0
-; AVX512VLBW-NEXT: movl $65537, %eax # imm = 0x10001
-; AVX512VLBW-NEXT: kmovd %eax, %k1
-; AVX512VLBW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v32i16:
@@ -1125,10 +1141,11 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512F-LABEL: constant_funnnel_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [49376,32928,16480,32,224,16416,32864,49312,49376,32928,16480,32,224,16416,32864,49312]
; AVX512F-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512F-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm3
@@ -1139,6 +1156,7 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3
; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8
; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm5, %ymm3, %ymm0, %ymm0
@@ -1164,22 +1182,22 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
; AVX512F-NEXT: vpmullw %ymm5, %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
-; AVX512F-NEXT: vpmullw %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpackuswb %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpternlogq $228, {{.*}}(%rip){1to8}, %zmm1, %zmm0
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
+; AVX512F-NEXT: vpmullw %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [49376,32928,16480,32,224,16416,32864,49312,49376,32928,16480,32,224,16416,32864,49312]
; AVX512VL-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm3
@@ -1190,6 +1208,7 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8
; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3
; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm0, %ymm0
@@ -1215,20 +1234,20 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
; AVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4
; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
-; AVX512VL-NEXT: vpmullw %ymm6, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512VL-NEXT: vpackuswb %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm0
-; AVX512VL-NEXT: vpternlogq $228, {{.*}}(%rip){1to8}, %zmm1, %zmm0
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
+; AVX512VL-NEXT: vpmullw %ymm6, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VL-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [49376,32928,16480,32,224,16416,32864,49312,49376,32928,16480,32,224,16416,32864,49312,49376,32928,16480,32,224,16416,32864,49312,49376,32928,16480,32,224,16416,32864,49312]
; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
@@ -1244,21 +1263,19 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3
; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
-; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2
-; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
-; AVX512BW-NEXT: vpackuswb %zmm3, %zmm2, %zmm2
-; AVX512BW-NEXT: vporq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
-; AVX512BW-NEXT: kmovq %rax, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
+; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpackuswb %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: constant_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512VBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [49376,32928,16480,32,224,16416,32864,49312,49376,32928,16480,32,224,16416,32864,49312,49376,32928,16480,32,224,16416,32864,49312,49376,32928,16480,32,224,16416,32864,49312]
; AVX512VBMI2-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
+; AVX512VBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
@@ -1274,21 +1291,19 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3
; AVX512VBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
-; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
-; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2
-; AVX512VBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2
-; AVX512VBMI2-NEXT: vpackuswb %zmm3, %zmm2, %zmm2
-; AVX512VBMI2-NEXT: vporq %zmm2, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
-; AVX512VBMI2-NEXT: kmovq %rax, %k1
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
+; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpackuswb %zmm3, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: constant_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512VLBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [49376,32928,16480,32,224,16416,32864,49312,49376,32928,16480,32,224,16416,32864,49312,49376,32928,16480,32,224,16416,32864,49312,49376,32928,16480,32,224,16416,32864,49312]
; AVX512VLBW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
+; AVX512VLBW-NEXT: vpaddb %zmm0, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
@@ -1304,21 +1319,19 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3
; AVX512VLBW-NEXT: vpsrlw $8, %zmm3, %zmm3
-; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
-; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2
-; AVX512VLBW-NEXT: vpsrlw $8, %zmm2, %zmm2
-; AVX512VLBW-NEXT: vpackuswb %zmm3, %zmm2, %zmm2
-; AVX512VLBW-NEXT: vporq %zmm2, %zmm0, %zmm0
-; AVX512VLBW-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
-; AVX512VLBW-NEXT: kmovq %rax, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
+; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpackuswb %zmm3, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512VLVBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [49376,32928,16480,32,224,16416,32864,49312,49376,32928,16480,32,224,16416,32864,49312,49376,32928,16480,32,224,16416,32864,49312,49376,32928,16480,32,224,16416,32864,49312]
; AVX512VLVBMI2-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
+; AVX512VLVBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
@@ -1334,14 +1347,11 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3
; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
-; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
-; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2
-; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2
-; AVX512VLVBMI2-NEXT: vpackuswb %zmm3, %zmm2, %zmm2
-; AVX512VLVBMI2-NEXT: vporq %zmm2, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
-; AVX512VLVBMI2-NEXT: kmovq %rax, %k1
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpackuswb %zmm3, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: retq
%res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
ret <64 x i8> %res
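For reference, the updated check lines above encode the select-free fshr expansion (y >> (amt & (W-1))) | ((x << 1) << (~amt & (W-1))): the left half is pre-shifted by one and then shifted by the inverted amount, so an amount of zero contributes nothing from x and the old vptestnm/vmovdqa zero-amount select disappears. A minimal scalar sketch of that expansion, assuming 64-bit elements and an illustrative helper name (not an LLVM API):

    #include <cstdint>

    // Scalar model of the vector sequence seen above:
    // vpand / vpsrlq, then vpsllq $1 / vpandn / vpsllvq, then vporq.
    uint64_t fshr64(uint64_t Hi, uint64_t Lo, uint64_t Amt) {
      const unsigned W = 64;
      uint64_t ShAmt  = Amt  & (W - 1);  // amt & 63   (vpand)
      uint64_t InvAmt = ~Amt & (W - 1);  // ~amt & 63  (vpandn)
      return (Lo >> ShAmt) | ((Hi << 1) << InvAmt);
    }

When Amt is a multiple of 64 the total left shift is 64, so the Hi term vanishes and the result is just Lo, matching the funnel-shift semantics without any masked blend.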
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index da32eaaebfa1..ecf27e5884da 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -80,7 +80,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind {
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v2i64:
@@ -92,7 +92,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind {
; AVX2-NEXT: vpsubq %xmm1, %xmm4, %xmm1
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v2i64:
@@ -592,7 +592,7 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512F-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT: vpord %zmm0, %zmm3, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -609,7 +609,7 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vpord %zmm0, %zmm3, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -626,7 +626,7 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT: vpor %ymm0, %ymm3, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
@@ -644,7 +644,7 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpor %ymm0, %ymm3, %ymm0
; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
@@ -731,7 +731,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind
; AVX-NEXT: vpsubq %xmm1, %xmm4, %xmm1
; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v2i64:
@@ -1142,7 +1142,7 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpslld %xmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT: vpord %zmm0, %zmm3, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -1159,7 +1159,7 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpslld %xmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vpord %zmm0, %zmm3, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -1176,7 +1176,7 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT: vpor %ymm0, %ymm3, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
@@ -1194,7 +1194,7 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm1, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpor %ymm0, %ymm3, %ymm0
; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
@@ -1258,13 +1258,13 @@ define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x) nounwind {
; SSE2-LABEL: constant_funnnel_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlq $4, %xmm1
+; SSE2-NEXT: psllq $60, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrlq $14, %xmm2
+; SSE2-NEXT: psllq $50, %xmm2
; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psllq $60, %xmm1
-; SSE2-NEXT: psllq $50, %xmm0
+; SSE2-NEXT: psrlq $4, %xmm1
+; SSE2-NEXT: psrlq $14, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: orpd %xmm2, %xmm0
; SSE2-NEXT: retq
@@ -1272,32 +1272,32 @@ define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x) nounwind {
; SSE41-LABEL: constant_funnnel_v2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlq $14, %xmm1
+; SSE41-NEXT: psllq $50, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrlq $4, %xmm2
+; SSE41-NEXT: psllq $60, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllq $50, %xmm1
-; SSE41-NEXT: psllq $60, %xmm0
+; SSE41-NEXT: psrlq $14, %xmm1
+; SSE41-NEXT: psrlq $4, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_funnnel_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsrlq $14, %xmm0, %xmm1
-; AVX1-NEXT: vpsrlq $4, %xmm0, %xmm2
+; AVX1-NEXT: vpsllq $50, %xmm0, %xmm1
+; AVX1-NEXT: vpsllq $60, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vpsllq $50, %xmm0, %xmm2
-; AVX1-NEXT: vpsllq $60, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlq $14, %xmm0, %xmm2
+; AVX1-NEXT: vpsrlq $4, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_funnnel_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm1
-; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm1
+; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
@@ -1336,16 +1336,24 @@ define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x) nounwind {
;
; X32-SSE-LABEL: constant_funnnel_v2i64:
; X32-SSE: # %bb.0:
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [63,0,63,0]
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = <4,u,14,u>
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: psubq %xmm2, %xmm3
+; X32-SSE-NEXT: pand %xmm1, %xmm2
+; X32-SSE-NEXT: movdqa %xmm0, %xmm4
+; X32-SSE-NEXT: psrlq %xmm2, %xmm4
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm5
+; X32-SSE-NEXT: psrlq %xmm2, %xmm5
+; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
+; X32-SSE-NEXT: pand %xmm1, %xmm3
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrlq $4, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrlq $14, %xmm2
-; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psllq $60, %xmm1
-; X32-SSE-NEXT: psllq $50, %xmm0
+; X32-SSE-NEXT: psllq %xmm3, %xmm1
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
+; X32-SSE-NEXT: psllq %xmm2, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X32-SSE-NEXT: orpd %xmm2, %xmm0
+; X32-SSE-NEXT: orpd %xmm5, %xmm0
; X32-SSE-NEXT: retl
%res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> <i64 4, i64 14>)
ret <2 x i64> %res
@@ -1610,8 +1618,8 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind {
; AVX512F-LABEL: constant_funnnel_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm1
-; AVX512F-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm1
+; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
@@ -1620,8 +1628,8 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind {
; AVX512VL-LABEL: constant_funnnel_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm1
-; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm1
+; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
@@ -1629,11 +1637,11 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind {
;
; AVX512BW-LABEL: constant_funnnel_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
+; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -1643,8 +1651,8 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind {
; AVX512VLBW-LABEL: constant_funnnel_v16i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm1
-; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm1
+; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
; AVX512VLBW-NEXT: vzeroupper
@@ -1690,15 +1698,15 @@ define <2 x i64> @splatconstant_funnnel_v2i64(<2 x i64> %x) nounwind {
; SSE-LABEL: splatconstant_funnnel_v2i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrlq $14, %xmm1
-; SSE-NEXT: psllq $50, %xmm0
+; SSE-NEXT: psllq $50, %xmm1
+; SSE-NEXT: psrlq $14, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_funnnel_v2i64:
; AVX: # %bb.0:
-; AVX-NEXT: vpsrlq $14, %xmm0, %xmm1
-; AVX-NEXT: vpsllq $50, %xmm0, %xmm0
+; AVX-NEXT: vpsllq $50, %xmm0, %xmm1
+; AVX-NEXT: vpsrlq $14, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
@@ -1736,9 +1744,11 @@ define <2 x i64> @splatconstant_funnnel_v2i64(<2 x i64> %x) nounwind {
; X32-SSE-LABEL: splatconstant_funnnel_v2i64:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrlq $14, %xmm1
-; X32-SSE-NEXT: psllq $50, %xmm0
-; X32-SSE-NEXT: por %xmm1, %xmm0
+; X32-SSE-NEXT: psllq $50, %xmm1
+; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm1[0,1]
+; X32-SSE-NEXT: psrlq $14, %xmm0
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm0[0,1]
+; X32-SSE-NEXT: orpd %xmm1, %xmm0
; X32-SSE-NEXT: retl
%res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> <i64 14, i64 14>)
ret <2 x i64> %res
@@ -1863,33 +1873,33 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
;
; AVX512F-LABEL: splatconstant_funnnel_v16i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm1
+; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm1
-; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm1
+; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %xmm1, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm1
+; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v16i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm1
-; AVX512VLBW-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpsrlw $4, %xmm0, %xmm1
+; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %xmm1, %xmm0
; AVX512VLBW-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
index a9d70fbba5d0..5211be0fcdc4 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -48,7 +48,7 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vorps %ymm3, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm0, %ymm3, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v4i64:
@@ -60,7 +60,7 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
; AVX2-NEXT: vpsubq %ymm1, %ymm4, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v4i64:
@@ -506,7 +506,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
;
@@ -522,7 +522,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
; AVX512VLBW-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VLBW-NEXT: retq
;
@@ -577,7 +577,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind
; AVX1-NEXT: vpsrlq %xmm1, %xmm4, %xmm3
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT: vorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_funnnel_v4i64:
@@ -589,7 +589,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind
; AVX2-NEXT: vpsubq %xmm1, %xmm4, %xmm1
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v4i64:
@@ -622,9 +622,9 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind
;
; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vprotq %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
@@ -649,9 +649,9 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind
define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v8i32:
; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
@@ -669,9 +669,9 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind
;
; AVX2-LABEL: splatvar_funnnel_v8i32:
; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
@@ -713,9 +713,9 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind
;
; XOPAVX1-LABEL: splatvar_funnnel_v8i32:
; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vprotd %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
@@ -740,10 +740,10 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind
define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
@@ -761,9 +761,9 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
;
; AVX2-LABEL: splatvar_funnnel_v16i16:
; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm2
@@ -776,9 +776,9 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
;
; AVX512-LABEL: splatvar_funnnel_v16i16:
; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpsubw %xmm1, %xmm2, %xmm1
-; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512-NEXT: vpsllw %xmm2, %ymm0, %ymm2
@@ -791,10 +791,10 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
;
; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
@@ -820,8 +820,8 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
; AVX1-LABEL: splatvar_funnnel_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
@@ -848,9 +848,9 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
;
; AVX2-LABEL: splatvar_funnnel_v32i8:
; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm3
@@ -871,9 +871,9 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
;
; AVX512F-LABEL: splatvar_funnnel_v32i8:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpsubb %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm3
@@ -894,9 +894,9 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
;
; AVX512VL-LABEL: splatvar_funnnel_v32i8:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpsubb %xmm1, %xmm2, %xmm1
-; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm3
@@ -927,7 +927,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
;
@@ -944,15 +944,15 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_funnnel_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vprotb %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0
@@ -982,18 +982,18 @@ define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x) nounwind {
; AVX1-LABEL: constant_funnnel_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpsrlq $60, %xmm1, %xmm2
-; AVX1-NEXT: vpsrlq $50, %xmm1, %xmm3
+; AVX1-NEXT: vpsllq $4, %xmm1, %xmm2
+; AVX1-NEXT: vpsllq $14, %xmm1, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpsrlq $14, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlq $4, %xmm0, %xmm4
+; AVX1-NEXT: vpsllq $50, %xmm0, %xmm3
+; AVX1-NEXT: vpsllq $60, %xmm0, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT: vpsllq $4, %xmm1, %xmm3
-; AVX1-NEXT: vpsllq $14, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlq $60, %xmm1, %xmm3
+; AVX1-NEXT: vpsrlq $50, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpsllq $50, %xmm0, %xmm3
-; AVX1-NEXT: vpsllq $60, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlq $14, %xmm0, %xmm3
+; AVX1-NEXT: vpsrlq $4, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
@@ -1001,8 +1001,8 @@ define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x) nounwind {
;
; AVX2-LABEL: constant_funnnel_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm1
-; AVX2-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
@@ -1324,8 +1324,8 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind {
; AVX512BW-LABEL: constant_funnnel_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
-; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
@@ -1333,8 +1333,8 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind {
; AVX512VLBW-LABEL: constant_funnnel_v32i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
-; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VLBW-NEXT: retq
@@ -1367,20 +1367,20 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind {
define <4 x i64> @splatconstant_funnnel_v4i64(<4 x i64> %x) nounwind {
; AVX1-LABEL: splatconstant_funnnel_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsrlq $14, %xmm0, %xmm1
+; AVX1-NEXT: vpsllq $50, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpsrlq $14, %xmm2, %xmm3
+; AVX1-NEXT: vpsllq $50, %xmm2, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1-NEXT: vpsllq $50, %xmm0, %xmm0
-; AVX1-NEXT: vpsllq $50, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlq $14, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlq $14, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_funnnel_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsrlq $14, %ymm0, %ymm1
-; AVX2-NEXT: vpsllq $50, %ymm0, %ymm0
+; AVX2-NEXT: vpsllq $50, %ymm0, %ymm1
+; AVX2-NEXT: vpsrlq $14, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
@@ -1581,17 +1581,17 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
;
; AVX512BW-LABEL: splatconstant_funnnel_v32i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsrlw $4, %ymm0, %ymm1
+; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm0
+; AVX512BW-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v32i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm1
-; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm1
+; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
index 0084702b7fd7..03dd5c07913f 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
@@ -60,7 +60,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512F-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
-; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v32i16:
@@ -91,7 +91,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
-; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v32i16:
@@ -103,7 +103,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512BW-NEXT: vpsubw %zmm1, %zmm4, %zmm1
; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v32i16:
@@ -115,7 +115,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm4, %zmm1
; AVX512VLBW-NEXT: vpandq %zmm2, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VLBW-NEXT: retq
%res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %amt)
ret <32 x i16> %res
@@ -187,7 +187,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v64i8:
@@ -255,7 +255,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512VL-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v64i8:
@@ -293,7 +293,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
-; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v64i8:
@@ -331,7 +331,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm1
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
-; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VLBW-NEXT: retq
%res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %amt)
ret <64 x i8> %res
@@ -385,7 +385,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw
; AVX512F-NEXT: vpslld %xmm1, %zmm2, %zmm1
; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v32i16:
@@ -409,7 +409,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw
; AVX512VL-NEXT: vpslld %xmm1, %zmm2, %zmm1
; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v32i16:
@@ -424,7 +424,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
@@ -439,7 +439,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VLBW-NEXT: retq
%splat = shufflevector <32 x i16> %amt, <32 x i16> undef, <32 x i32> zeroinitializer
%res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %splat)
@@ -536,45 +536,45 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind
; AVX512BW-LABEL: splatvar_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsubb %xmm1, %xmm2, %xmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm4
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpsrlw %xmm3, %zmm0, %zmm4
; AVX512BW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512BW-NEXT: vpsllw %xmm2, %xmm5, %xmm2
-; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512BW-NEXT: vpandq %zmm2, %zmm4, %zmm2
-; AVX512BW-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
+; AVX512BW-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3
+; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512BW-NEXT: vpsubb %xmm1, %xmm4, %xmm1
+; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
-; AVX512BW-NEXT: vpsrlw %xmm1, %xmm5, %xmm0
-; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vpsllw %xmm1, %xmm5, %xmm0
; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0
-; AVX512BW-NEXT: vpternlogq $236, %zmm3, %zmm2, %zmm0
+; AVX512BW-NEXT: vpternlogq $236, %zmm2, %zmm3, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm4
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm0, %zmm4
; AVX512VLBW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm5, %xmm2
-; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512VLBW-NEXT: vpandq %zmm2, %zmm4, %zmm2
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
+; AVX512VLBW-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3
+; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
-; AVX512VLBW-NEXT: vpsrlw %xmm1, %xmm5, %xmm0
-; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm2
+; AVX512VLBW-NEXT: vpsllw %xmm1, %xmm5, %xmm0
; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %zmm0
-; AVX512VLBW-NEXT: vpternlogq $236, %zmm3, %zmm2, %zmm0
+; AVX512VLBW-NEXT: vpternlogq $236, %zmm2, %zmm3, %zmm0
; AVX512VLBW-NEXT: retq
%splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
%res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %splat)
@@ -619,7 +619,7 @@ define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x) nounwind {
; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpmullw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v32i16:
@@ -637,20 +637,20 @@ define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x) nounwind {
; AVX512VL-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpmullw %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
-; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: constant_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
-; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
%res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
@@ -704,7 +704,7 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind {
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v64i8:
@@ -753,7 +753,7 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind {
; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v64i8:
@@ -780,7 +780,7 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind {
; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: constant_funnnel_v64i8:
@@ -807,7 +807,7 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind {
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512VLBW-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
%res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
ret <64 x i8> %res
@@ -838,39 +838,39 @@ define <16 x i32> @splatconstant_funnnel_v16i32(<16 x i32> %x) nounwind {
define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm1
+; AVX512F-NEXT: vpsllw $9, %ymm0, %ymm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm3
+; AVX512F-NEXT: vpsllw $9, %ymm2, %ymm3
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512F-NEXT: vpsllw $9, %ymm0, %ymm0
-; AVX512F-NEXT: vpsllw $9, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm1
+; AVX512VL-NEXT: vpsllw $9, %ymm0, %ymm1
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VL-NEXT: vpsrlw $7, %ymm2, %ymm3
+; AVX512VL-NEXT: vpsllw $9, %ymm2, %ymm3
; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512VL-NEXT: vpsllw $9, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsllw $9, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw $7, %ymm2, %ymm2
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm1
-; AVX512BW-NEXT: vpsllw $9, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllw $9, %zmm0, %zmm1
+; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpsrlw $7, %zmm0, %zmm1
-; AVX512VLBW-NEXT: vpsllw $9, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpsllw $9, %zmm0, %zmm1
+; AVX512VLBW-NEXT: vpsrlw $7, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
%res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
@@ -880,39 +880,39 @@ define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x) nounwind {
define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm3
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm1
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm3
+; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm3
; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm2
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
+; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
-; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm1
+; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
%res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
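For readers tracing the splatconstant_* diffs above, the underlying IR pattern is a funnel shift whose two data operands are the same value, which llc treats as a rotate. A minimal standalone sketch follows (the function name is invented for illustration; the intrinsic declaration and constants match the existing tests):

; rotate-right-by-14 expressed as fshr with both data operands equal,
; the same pattern used by splatconstant_funnnel_v2i64 in these tests
declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)

define <2 x i64> @rotate_right_by_14(<2 x i64> %x) nounwind {
  %r = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> <i64 14, i64 14>)
  ret <2 x i64> %r
}

Fed to llc with a plain x86-64 SSE2 target, this should reproduce the psllq $50 / psrlq $14 / por sequence shown in the SSE block of the splatconstant_funnnel_v2i64 diff.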