[llvm] r229964 - [x86] Remove the old vector shuffle lowering code and its flag.
Philip Reames
listmail at philipreames.com
Fri Feb 20 10:19:23 PST 2015
Thank you for all the work on this!
Philip
On 02/19/2015 08:25 PM, Chandler Carruth wrote:
> Author: chandlerc
> Date: Thu Feb 19 22:25:04 2015
> New Revision: 229964
>
> URL: http://llvm.org/viewvc/llvm-project?rev=229964&view=rev
> Log:
> [x86] Remove the old vector shuffle lowering code and its flag.
>
> The new shuffle lowering has been the default for some time. I've
> enabled the new legality testing by default with no truly blocking
> regressions. I've fuzz tested this very heavily (many millions of fuzz
> test cases have passed at this point), and this cleans up a ton of code.
> =]
>
> Thanks again to the many folks that helped with this transition. There
> was a lot of work by others that went into the new shuffle lowering to
> make it really excellent.
>
> In case you aren't using a diff algorithm that can handle this:
> X86ISelLowering.cpp: 22 insertions(+), 2940 deletions(-)
>
> Removed:
> llvm/trunk/test/CodeGen/X86/avx2-pmovx-256-old-shuffle.ll
> Modified:
> llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
>
> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=229964&r1=229963&r2=229964&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Feb 19 22:25:04 2015
> @@ -67,11 +67,6 @@ static cl::opt<bool> ExperimentalVectorW
> "rather than promotion."),
> cl::Hidden);
>
> -static cl::opt<bool> ExperimentalVectorShuffleLowering(
> - "x86-experimental-vector-shuffle-lowering", cl::init(true),
> - cl::desc("Enable an experimental vector shuffle lowering code path."),
> - cl::Hidden);
> -
> static cl::opt<int> ReciprocalEstimateRefinementSteps(
> "x86-recip-refinement-steps", cl::init(1),
> cl::desc("Specify the number of Newton-Raphson iterations applied to the "
> @@ -3613,17 +3608,6 @@ static bool isTargetShuffle(unsigned Opc
> }
>
> static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
> - SDValue V1, SelectionDAG &DAG) {
> - switch(Opc) {
> - default: llvm_unreachable("Unknown x86 shuffle node");
> - case X86ISD::MOVSHDUP:
> - case X86ISD::MOVSLDUP:
> - case X86ISD::MOVDDUP:
> - return DAG.getNode(Opc, dl, VT, V1);
> - }
> -}
> -
> -static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
> SDValue V1, unsigned TargetMask,
> SelectionDAG &DAG) {
> switch(Opc) {
> @@ -3638,20 +3622,6 @@ static SDValue getTargetShuffleNode(unsi
> }
>
> static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
> - SDValue V1, SDValue V2, unsigned TargetMask,
> - SelectionDAG &DAG) {
> - switch(Opc) {
> - default: llvm_unreachable("Unknown x86 shuffle node");
> - case X86ISD::PALIGNR:
> - case X86ISD::VALIGN:
> - case X86ISD::SHUFP:
> - case X86ISD::VPERM2X128:
> - return DAG.getNode(Opc, dl, VT, V1, V2,
> - DAG.getConstant(TargetMask, MVT::i8));
> - }
> -}
> -
> -static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
> SDValue V1, SDValue V2, SelectionDAG &DAG) {
> switch(Opc) {
> default: llvm_unreachable("Unknown x86 shuffle node");
> @@ -3937,176 +3907,6 @@ static bool isSequentialOrUndefInRange(A
> return true;
> }
>
> -/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
> -/// is suitable for input to PSHUFD. That is, it doesn't reference the other
> -/// operand - by default will match for first operand.
> -static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT,
> - bool TestSecondOperand = false) {
> - if (VT != MVT::v4f32 && VT != MVT::v4i32 &&
> - VT != MVT::v2f64 && VT != MVT::v2i64)
> - return false;
> -
> - unsigned NumElems = VT.getVectorNumElements();
> - unsigned Lo = TestSecondOperand ? NumElems : 0;
> - unsigned Hi = Lo + NumElems;
> -
> - for (unsigned i = 0; i < NumElems; ++i)
> - if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi))
> - return false;
> -
> - return true;
> -}
> -
> -/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
> -/// is suitable for input to PSHUFHW.
> -static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
> - if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
> - return false;
> -
> - // Lower quadword copied in order or undef.
> - if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
> - return false;
> -
> - // Upper quadword shuffled.
> - for (unsigned i = 4; i != 8; ++i)
> - if (!isUndefOrInRange(Mask[i], 4, 8))
> - return false;
> -
> - if (VT == MVT::v16i16) {
> - // Lower quadword copied in order or undef.
> - if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
> - return false;
> -
> - // Upper quadword shuffled.
> - for (unsigned i = 12; i != 16; ++i)
> - if (!isUndefOrInRange(Mask[i], 12, 16))
> - return false;
> - }
> -
> - return true;
> -}
> -
> -/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
> -/// is suitable for input to PSHUFLW.
> -static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
> - if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
> - return false;
> -
> - // Upper quadword copied in order.
> - if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
> - return false;
> -
> - // Lower quadword shuffled.
> - for (unsigned i = 0; i != 4; ++i)
> - if (!isUndefOrInRange(Mask[i], 0, 4))
> - return false;
> -
> - if (VT == MVT::v16i16) {
> - // Upper quadword copied in order.
> - if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
> - return false;
> -
> - // Lower quadword shuffled.
> - for (unsigned i = 8; i != 12; ++i)
> - if (!isUndefOrInRange(Mask[i], 8, 12))
> - return false;
> - }
> -
> - return true;
> -}
> -
> -/// \brief Return true if the mask specifies a shuffle of elements that is
> -/// suitable for input to intralane (palignr) or interlane (valign) vector
> -/// right-shift.
> -static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) {
> - unsigned NumElts = VT.getVectorNumElements();
> - unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128;
> - unsigned NumLaneElts = NumElts/NumLanes;
> -
> - // Do not handle 64-bit element shuffles with palignr.
> - if (NumLaneElts == 2)
> - return false;
> -
> - for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
> - unsigned i;
> - for (i = 0; i != NumLaneElts; ++i) {
> - if (Mask[i+l] >= 0)
> - break;
> - }
> -
> - // Lane is all undef, go to next lane
> - if (i == NumLaneElts)
> - continue;
> -
> - int Start = Mask[i+l];
> -
> - // Make sure its in this lane in one of the sources
> - if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
> - !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
> - return false;
> -
> - // If not lane 0, then we must match lane 0
> - if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
> - return false;
> -
> - // Correct second source to be contiguous with first source
> - if (Start >= (int)NumElts)
> - Start -= NumElts - NumLaneElts;
> -
> - // Make sure we're shifting in the right direction.
> - if (Start <= (int)(i+l))
> - return false;
> -
> - Start -= i;
> -
> - // Check the rest of the elements to see if they are consecutive.
> - for (++i; i != NumLaneElts; ++i) {
> - int Idx = Mask[i+l];
> -
> - // Make sure its in this lane
> - if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
> - !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
> - return false;
> -
> - // If not lane 0, then we must match lane 0
> - if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
> - return false;
> -
> - if (Idx >= (int)NumElts)
> - Idx -= NumElts - NumLaneElts;
> -
> - if (!isUndefOrEqual(Idx, Start+i))
> - return false;
> -
> - }
> - }
> -
> - return true;
> -}
> -
> -/// \brief Return true if the node specifies a shuffle of elements that is
> -/// suitable for input to PALIGNR.
> -static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
> - const X86Subtarget *Subtarget) {
> - if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
> - (VT.is256BitVector() && !Subtarget->hasInt256()) ||
> - VT.is512BitVector())
> - // FIXME: Add AVX512BW.
> - return false;
> -
> - return isAlignrMask(Mask, VT, false);
> -}
> -
> -/// \brief Return true if the node specifies a shuffle of elements that is
> -/// suitable for input to VALIGN.
> -static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT,
> - const X86Subtarget *Subtarget) {
> - // FIXME: Add AVX512VL.
> - if (!VT.is512BitVector() || !Subtarget->hasAVX512())
> - return false;
> - return isAlignrMask(Mask, VT, true);
> -}
> -
> /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
> /// the two vector operands have swapped position.
> static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
> @@ -4122,8540 +3922,6016 @@ static void CommuteVectorShuffleMask(Sma
> }
> }
>
> -/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
> -/// specifies a shuffle of elements that is suitable for input to 128/256-bit
> -/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
> -/// reverse of what x86 shuffles want.
> -static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
> -
> - unsigned NumElems = VT.getVectorNumElements();
> - unsigned NumLanes = VT.getSizeInBits()/128;
> - unsigned NumLaneElems = NumElems/NumLanes;
> -
> - if (NumLaneElems != 2 && NumLaneElems != 4)
> +/// isVEXTRACTIndex - Return true if the specified
> +/// EXTRACT_SUBVECTOR operand specifies a vector extract that is
> +/// suitable for instruction that extract 128 or 256 bit vectors
> +static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
> + assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
> + if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
> return false;
>
> - unsigned EltSize = VT.getVectorElementType().getSizeInBits();
> - bool symmetricMaskRequired =
> - (VT.getSizeInBits() >= 256) && (EltSize == 32);
> -
> - // VSHUFPSY divides the resulting vector into 4 chunks.
> - // The sources are also splitted into 4 chunks, and each destination
> - // chunk must come from a different source chunk.
> - //
> - // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0
> - // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y9
> - //
> - // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4,
> - // Y3..Y0, Y3..Y0, X3..X0, X3..X0
> - //
> - // VSHUFPDY divides the resulting vector into 4 chunks.
> - // The sources are also splitted into 4 chunks, and each destination
> - // chunk must come from a different source chunk.
> - //
> - // SRC1 => X3 X2 X1 X0
> - // SRC2 => Y3 Y2 Y1 Y0
> - //
> - // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0
> - //
> - SmallVector<int, 4> MaskVal(NumLaneElems, -1);
> - unsigned HalfLaneElems = NumLaneElems/2;
> - for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
> - for (unsigned i = 0; i != NumLaneElems; ++i) {
> - int Idx = Mask[i+l];
> - unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
> - if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
> - return false;
> - // For VSHUFPSY, the mask of the second half must be the same as the
> - // first but with the appropriate offsets. This works in the same way as
> - // VPERMILPS works with masks.
> - if (!symmetricMaskRequired || Idx < 0)
> - continue;
> - if (MaskVal[i] < 0) {
> - MaskVal[i] = Idx - l;
> - continue;
> - }
> - if ((signed)(Idx - l) != MaskVal[i])
> - return false;
> - }
> - }
> + // The index should be aligned on a vecWidth-bit boundary.
> + uint64_t Index =
> + cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
>
> - return true;
> + MVT VT = N->getSimpleValueType(0);
> + unsigned ElSize = VT.getVectorElementType().getSizeInBits();
> + bool Result = (Index * ElSize) % vecWidth == 0;
> +
> + return Result;
> }
>
> -/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
> -/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
> -static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
> - if (!VT.is128BitVector())
> +/// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
> +/// operand specifies a subvector insert that is suitable for input to
> +/// insertion of 128 or 256-bit subvectors
> +static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
> + assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
> + if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
> return false;
> + // The index should be aligned on a vecWidth-bit boundary.
> + uint64_t Index =
> + cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
>
> - unsigned NumElems = VT.getVectorNumElements();
> -
> - if (NumElems != 4)
> - return false;
> + MVT VT = N->getSimpleValueType(0);
> + unsigned ElSize = VT.getVectorElementType().getSizeInBits();
> + bool Result = (Index * ElSize) % vecWidth == 0;
>
> - // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
> - return isUndefOrEqual(Mask[0], 6) &&
> - isUndefOrEqual(Mask[1], 7) &&
> - isUndefOrEqual(Mask[2], 2) &&
> - isUndefOrEqual(Mask[3], 3);
> + return Result;
> }
>
> -/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
> -/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
> -/// <2, 3, 2, 3>
> -static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
> - if (!VT.is128BitVector())
> - return false;
> -
> - unsigned NumElems = VT.getVectorNumElements();
> -
> - if (NumElems != 4)
> - return false;
> +bool X86::isVINSERT128Index(SDNode *N) {
> + return isVINSERTIndex(N, 128);
> +}
>
> - return isUndefOrEqual(Mask[0], 2) &&
> - isUndefOrEqual(Mask[1], 3) &&
> - isUndefOrEqual(Mask[2], 2) &&
> - isUndefOrEqual(Mask[3], 3);
> +bool X86::isVINSERT256Index(SDNode *N) {
> + return isVINSERTIndex(N, 256);
> }
>
> -/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
> -/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
> -static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
> - if (!VT.is128BitVector())
> - return false;
> +bool X86::isVEXTRACT128Index(SDNode *N) {
> + return isVEXTRACTIndex(N, 128);
> +}
>
> - unsigned NumElems = VT.getVectorNumElements();
> +bool X86::isVEXTRACT256Index(SDNode *N) {
> + return isVEXTRACTIndex(N, 256);
> +}
>
> - if (NumElems != 2 && NumElems != 4)
> - return false;
> +static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
> + assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
> + if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
> + llvm_unreachable("Illegal extract subvector for VEXTRACT");
>
> - for (unsigned i = 0, e = NumElems/2; i != e; ++i)
> - if (!isUndefOrEqual(Mask[i], i + NumElems))
> - return false;
> + uint64_t Index =
> + cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
>
> - for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
> - if (!isUndefOrEqual(Mask[i], i))
> - return false;
> + MVT VecVT = N->getOperand(0).getSimpleValueType();
> + MVT ElVT = VecVT.getVectorElementType();
>
> - return true;
> + unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
> + return Index / NumElemsPerChunk;
> }
>
> -/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
> -/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
> -static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
> - if (!VT.is128BitVector())
> - return false;
> +static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
> + assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
> + if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
> + llvm_unreachable("Illegal insert subvector for VINSERT");
>
> - unsigned NumElems = VT.getVectorNumElements();
> + uint64_t Index =
> + cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
>
> - if (NumElems != 2 && NumElems != 4)
> - return false;
> + MVT VecVT = N->getSimpleValueType(0);
> + MVT ElVT = VecVT.getVectorElementType();
>
> - for (unsigned i = 0, e = NumElems/2; i != e; ++i)
> - if (!isUndefOrEqual(Mask[i], i))
> - return false;
> + unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
> + return Index / NumElemsPerChunk;
> +}
>
> - for (unsigned i = 0, e = NumElems/2; i != e; ++i)
> - if (!isUndefOrEqual(Mask[i + e], i + NumElems))
> - return false;
> +/// getExtractVEXTRACT128Immediate - Return the appropriate immediate
> +/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
> +/// and VINSERTI128 instructions.
> +unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
> + return getExtractVEXTRACTImmediate(N, 128);
> +}
>
> - return true;
> +/// getExtractVEXTRACT256Immediate - Return the appropriate immediate
> +/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
> +/// and VINSERTI64x4 instructions.
> +unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
> + return getExtractVEXTRACTImmediate(N, 256);
> }
>
> -/// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
> -/// specifies a shuffle of elements that is suitable for input to INSERTPS.
> -/// i. e: If all but one element come from the same vector.
> -static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
> - // TODO: Deal with AVX's VINSERTPS
> - if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
> - return false;
> +/// getInsertVINSERT128Immediate - Return the appropriate immediate
> +/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
> +/// and VINSERTI128 instructions.
> +unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
> + return getInsertVINSERTImmediate(N, 128);
> +}
>
> - unsigned CorrectPosV1 = 0;
> - unsigned CorrectPosV2 = 0;
> - for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
> - if (Mask[i] == -1) {
> - ++CorrectPosV1;
> - ++CorrectPosV2;
> - continue;
> - }
> +/// getInsertVINSERT256Immediate - Return the appropriate immediate
> +/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4
> +/// and VINSERTI64x4 instructions.
> +unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
> + return getInsertVINSERTImmediate(N, 256);
> +}
>
> - if (Mask[i] == i)
> - ++CorrectPosV1;
> - else if (Mask[i] == i + 4)
> - ++CorrectPosV2;
> - }
> +/// isZero - Returns true if Elt is a constant integer zero
> +static bool isZero(SDValue V) {
> + ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
> + return C && C->isNullValue();
> +}
>
> - if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
> - // We have 3 elements (undefs count as elements from any vector) from one
> - // vector, and one from another.
> +/// isZeroNode - Returns true if Elt is a constant zero or a floating point
> +/// constant +0.0.
> +bool X86::isZeroNode(SDValue Elt) {
> + if (isZero(Elt))
> return true;
> -
> + if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
> + return CFP->getValueAPF().isPosZero();
> return false;
> }
>
> -//
> -// Some special combinations that can be optimized.
> -//
> -static
> -SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
> - SelectionDAG &DAG) {
> - MVT VT = SVOp->getSimpleValueType(0);
> - SDLoc dl(SVOp);
> -
> - if (VT != MVT::v8i32 && VT != MVT::v8f32)
> - return SDValue();
> -
> - ArrayRef<int> Mask = SVOp->getMask();
> -
> - // These are the special masks that may be optimized.
> - static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
> - static const int MaskToOptimizeOdd[] = {1, 9, 3, 11, 5, 13, 7, 15};
> - bool MatchEvenMask = true;
> - bool MatchOddMask = true;
> - for (int i=0; i<8; ++i) {
> - if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
> - MatchEvenMask = false;
> - if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
> - MatchOddMask = false;
> - }
> -
> - if (!MatchEvenMask && !MatchOddMask)
> - return SDValue();
> -
> - SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
> -
> - SDValue Op0 = SVOp->getOperand(0);
> - SDValue Op1 = SVOp->getOperand(1);
> -
> - if (MatchEvenMask) {
> - // Shift the second operand right to 32 bits.
> - static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
> - Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
> - } else {
> - // Shift the first operand left to 32 bits.
> - static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
> - Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
> - }
> - static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
> - return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
> -}
> -
> -/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
> -/// specifies a shuffle of elements that is suitable for input to UNPCKL.
> -static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
> - bool HasInt256, bool V2IsSplat = false) {
> -
> - assert(VT.getSizeInBits() >= 128 &&
> - "Unsupported vector type for unpckl");
> +/// getZeroVector - Returns a vector of specified type with all zero elements.
> +///
> +static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
> + SelectionDAG &DAG, SDLoc dl) {
> + assert(VT.isVector() && "Expected a vector type");
>
> - unsigned NumElts = VT.getVectorNumElements();
> - if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
> - (!HasInt256 || (NumElts != 16 && NumElts != 32)))
> - return false;
> + // Always build SSE zero vectors as <4 x i32> bitcasted
> + // to their dest type. This ensures they get CSE'd.
> + SDValue Vec;
> + if (VT.is128BitVector()) { // SSE
> + if (Subtarget->hasSSE2()) { // SSE2
> + SDValue Cst = DAG.getConstant(0, MVT::i32);
> + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
> + } else { // SSE1
> + SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
> + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
> + }
> + } else if (VT.is256BitVector()) { // AVX
> + if (Subtarget->hasInt256()) { // AVX2
> + SDValue Cst = DAG.getConstant(0, MVT::i32);
> + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
> + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
> + } else {
> + // 256-bit logic and arithmetic instructions in AVX are all
> + // floating-point, no support for integer ops. Emit fp zeroed vectors.
> + SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
> + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
> + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
> + }
> + } else if (VT.is512BitVector()) { // AVX-512
> + SDValue Cst = DAG.getConstant(0, MVT::i32);
> + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
> + Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
> + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
> + } else if (VT.getScalarType() == MVT::i1) {
> + assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
> + SDValue Cst = DAG.getConstant(0, MVT::i1);
> + SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
> + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
> + } else
> + llvm_unreachable("Unexpected vector type");
>
> - assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
> - "Unsupported vector type for unpckh");
> + return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
> +}
>
> - // AVX defines UNPCK* to operate independently on 128-bit lanes.
> - unsigned NumLanes = VT.getSizeInBits()/128;
> - unsigned NumLaneElts = NumElts/NumLanes;
> +/// getOnesVector - Returns a vector of specified type with all bits set.
> +/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
> +/// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
> +/// Then bitcast to their original type, ensuring they get CSE'd.
> +static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
> + SDLoc dl) {
> + assert(VT.isVector() && "Expected a vector type");
>
> - for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
> - for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
> - int BitI = Mask[l+i];
> - int BitI1 = Mask[l+i+1];
> - if (!isUndefOrEqual(BitI, j))
> - return false;
> - if (V2IsSplat) {
> - if (!isUndefOrEqual(BitI1, NumElts))
> - return false;
> - } else {
> - if (!isUndefOrEqual(BitI1, j + NumElts))
> - return false;
> - }
> + SDValue Cst = DAG.getConstant(~0U, MVT::i32);
> + SDValue Vec;
> + if (VT.is256BitVector()) {
> + if (HasInt256) { // AVX2
> + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
> + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
> + } else { // AVX
> + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
> + Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
> }
> - }
> + } else if (VT.is128BitVector()) {
> + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
> + } else
> + llvm_unreachable("Unexpected vector type");
>
> - return true;
> + return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
> }
>
> -/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
> -/// specifies a shuffle of elements that is suitable for input to UNPCKH.
> -static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
> - bool HasInt256, bool V2IsSplat = false) {
> - assert(VT.getSizeInBits() >= 128 &&
> - "Unsupported vector type for unpckh");
> -
> - unsigned NumElts = VT.getVectorNumElements();
> - if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
> - (!HasInt256 || (NumElts != 16 && NumElts != 32)))
> - return false;
> -
> - assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
> - "Unsupported vector type for unpckh");
> +/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
> +/// operation of specified width.
> +static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
> + SDValue V2) {
> + unsigned NumElems = VT.getVectorNumElements();
> + SmallVector<int, 8> Mask;
> + Mask.push_back(NumElems);
> + for (unsigned i = 1; i != NumElems; ++i)
> + Mask.push_back(i);
> + return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
> +}
>
> - // AVX defines UNPCK* to operate independently on 128-bit lanes.
> - unsigned NumLanes = VT.getSizeInBits()/128;
> - unsigned NumLaneElts = NumElts/NumLanes;
> +/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
> +static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
> + SDValue V2) {
> + unsigned NumElems = VT.getVectorNumElements();
> + SmallVector<int, 8> Mask;
> + for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
> + Mask.push_back(i);
> + Mask.push_back(i + NumElems);
> + }
> + return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
> +}
>
> - for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
> - for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
> - int BitI = Mask[l+i];
> - int BitI1 = Mask[l+i+1];
> - if (!isUndefOrEqual(BitI, j))
> - return false;
> - if (V2IsSplat) {
> - if (isUndefOrEqual(BitI1, NumElts))
> - return false;
> - } else {
> - if (!isUndefOrEqual(BitI1, j+NumElts))
> - return false;
> - }
> - }
> +/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
> +static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
> + SDValue V2) {
> + unsigned NumElems = VT.getVectorNumElements();
> + SmallVector<int, 8> Mask;
> + for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
> + Mask.push_back(i + Half);
> + Mask.push_back(i + NumElems + Half);
> }
> - return true;
> + return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
> }
>
> -/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
> -/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
> -/// <0, 0, 1, 1>
> -static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
> - unsigned NumElts = VT.getVectorNumElements();
> - bool Is256BitVec = VT.is256BitVector();
> +/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
> +/// vector of zero or undef vector. This produces a shuffle where the low
> +/// element of V2 is swizzled into the zero/undef vector, landing at element
> +/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
> +static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
> + bool IsZero,
> + const X86Subtarget *Subtarget,
> + SelectionDAG &DAG) {
> + MVT VT = V2.getSimpleValueType();
> + SDValue V1 = IsZero
> + ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
> + unsigned NumElems = VT.getVectorNumElements();
> + SmallVector<int, 16> MaskVec;
> + for (unsigned i = 0; i != NumElems; ++i)
> + // If this is the insertion idx, put the low elt of V2 here.
> + MaskVec.push_back(i == Idx ? NumElems : i);
> + return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
> +}
>
> - if (VT.is512BitVector())
> - return false;
> - assert((VT.is128BitVector() || VT.is256BitVector()) &&
> - "Unsupported vector type for unpckh");
> +/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
> +/// target specific opcode. Returns true if the Mask could be calculated. Sets
> +/// IsUnary to true if only uses one source. Note that this will set IsUnary for
> +/// shuffles which use a single input multiple times, and in those cases it will
> +/// adjust the mask to only have indices within that single input.
> +static bool getTargetShuffleMask(SDNode *N, MVT VT,
> + SmallVectorImpl<int> &Mask, bool &IsUnary) {
> + unsigned NumElems = VT.getVectorNumElements();
> + SDValue ImmN;
>
> - if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
> - (!HasInt256 || (NumElts != 16 && NumElts != 32)))
> - return false;
> + IsUnary = false;
> + bool IsFakeUnary = false;
> + switch(N->getOpcode()) {
> + case X86ISD::BLENDI:
> + ImmN = N->getOperand(N->getNumOperands()-1);
> + DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
> + break;
> + case X86ISD::SHUFP:
> + ImmN = N->getOperand(N->getNumOperands()-1);
> + DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
> + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
> + break;
> + case X86ISD::UNPCKH:
> + DecodeUNPCKHMask(VT, Mask);
> + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
> + break;
> + case X86ISD::UNPCKL:
> + DecodeUNPCKLMask(VT, Mask);
> + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
> + break;
> + case X86ISD::MOVHLPS:
> + DecodeMOVHLPSMask(NumElems, Mask);
> + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
> + break;
> + case X86ISD::MOVLHPS:
> + DecodeMOVLHPSMask(NumElems, Mask);
> + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
> + break;
> + case X86ISD::PALIGNR:
> + ImmN = N->getOperand(N->getNumOperands()-1);
> + DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
> + break;
> + case X86ISD::PSHUFD:
> + case X86ISD::VPERMILPI:
> + ImmN = N->getOperand(N->getNumOperands()-1);
> + DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
> + IsUnary = true;
> + break;
> + case X86ISD::PSHUFHW:
> + ImmN = N->getOperand(N->getNumOperands()-1);
> + DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
> + IsUnary = true;
> + break;
> + case X86ISD::PSHUFLW:
> + ImmN = N->getOperand(N->getNumOperands()-1);
> + DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
> + IsUnary = true;
> + break;
> + case X86ISD::PSHUFB: {
> + IsUnary = true;
> + SDValue MaskNode = N->getOperand(1);
> + while (MaskNode->getOpcode() == ISD::BITCAST)
> + MaskNode = MaskNode->getOperand(0);
>
> - // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
> - // FIXME: Need a better way to get rid of this, there's no latency difference
> - // between UNPCKLPD and MOVDDUP, the later should always be checked first and
> - // the former later. We should also remove the "_undef" special mask.
> - if (NumElts == 4 && Is256BitVec)
> - return false;
> + if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
> + // If we have a build-vector, then things are easy.
> + EVT VT = MaskNode.getValueType();
> + assert(VT.isVector() &&
> + "Can't produce a non-vector with a build_vector!");
> + if (!VT.isInteger())
> + return false;
>
> - // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
> - // independently on 128-bit lanes.
> - unsigned NumLanes = VT.getSizeInBits()/128;
> - unsigned NumLaneElts = NumElts/NumLanes;
> + int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
>
> - for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
> - for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
> - int BitI = Mask[l+i];
> - int BitI1 = Mask[l+i+1];
> + SmallVector<uint64_t, 32> RawMask;
> + for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
> + SDValue Op = MaskNode->getOperand(i);
> + if (Op->getOpcode() == ISD::UNDEF) {
> + RawMask.push_back((uint64_t)SM_SentinelUndef);
> + continue;
> + }
> + auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
> + if (!CN)
> + return false;
> + APInt MaskElement = CN->getAPIntValue();
>
> - if (!isUndefOrEqual(BitI, j))
> - return false;
> - if (!isUndefOrEqual(BitI1, j))
> - return false;
> + // We now have to decode the element which could be any integer size and
> + // extract each byte of it.
> + for (int j = 0; j < NumBytesPerElement; ++j) {
> + // Note that this is x86 and so always little endian: the low byte is
> + // the first byte of the mask.
> + RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
> + MaskElement = MaskElement.lshr(8);
> + }
> + }
> + DecodePSHUFBMask(RawMask, Mask);
> + break;
> }
> - }
>
> - return true;
> -}
> + auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
> + if (!MaskLoad)
> + return false;
>
> -/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
> -/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
> -/// <2, 2, 3, 3>
> -static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
> - unsigned NumElts = VT.getVectorNumElements();
> + SDValue Ptr = MaskLoad->getBasePtr();
> + if (Ptr->getOpcode() == X86ISD::Wrapper)
> + Ptr = Ptr->getOperand(0);
>
> - if (VT.is512BitVector())
> - return false;
> + auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
> + if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
> + return false;
>
> - assert((VT.is128BitVector() || VT.is256BitVector()) &&
> - "Unsupported vector type for unpckh");
> + if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
> + DecodePSHUFBMask(C, Mask);
> + if (Mask.empty())
> + return false;
> + break;
> + }
>
> - if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
> - (!HasInt256 || (NumElts != 16 && NumElts != 32)))
> return false;
> -
> - // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
> - // independently on 128-bit lanes.
> - unsigned NumLanes = VT.getSizeInBits()/128;
> - unsigned NumLaneElts = NumElts/NumLanes;
> -
> - for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
> - for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
> - int BitI = Mask[l+i];
> - int BitI1 = Mask[l+i+1];
> - if (!isUndefOrEqual(BitI, j))
> - return false;
> - if (!isUndefOrEqual(BitI1, j))
> - return false;
> - }
> }
> - return true;
> -}
> -
> -// Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or
> -// (src1[0], src0[1]), manipulation with 256-bit sub-vectors
> -static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
> - if (!VT.is512BitVector())
> + case X86ISD::VPERMI:
> + ImmN = N->getOperand(N->getNumOperands()-1);
> + DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
> + IsUnary = true;
> + break;
> + case X86ISD::MOVSS:
> + case X86ISD::MOVSD:
> + DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
> + break;
> + case X86ISD::VPERM2X128:
> + ImmN = N->getOperand(N->getNumOperands()-1);
> + DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
> + if (Mask.empty()) return false;
> + break;
> + case X86ISD::MOVSLDUP:
> + DecodeMOVSLDUPMask(VT, Mask);
> + IsUnary = true;
> + break;
> + case X86ISD::MOVSHDUP:
> + DecodeMOVSHDUPMask(VT, Mask);
> + IsUnary = true;
> + break;
> + case X86ISD::MOVDDUP:
> + DecodeMOVDDUPMask(VT, Mask);
> + IsUnary = true;
> + break;
> + case X86ISD::MOVLHPD:
> + case X86ISD::MOVLPD:
> + case X86ISD::MOVLPS:
> + // Not yet implemented
> return false;
> -
> - unsigned NumElts = VT.getVectorNumElements();
> - unsigned HalfSize = NumElts/2;
> - if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
> - if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
> - *Imm = 1;
> - return true;
> - }
> - }
> - if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
> - if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
> - *Imm = 0;
> - return true;
> - }
> + default: llvm_unreachable("unknown target shuffle node");
> }
> - return false;
> -}
> -
> -/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
> -/// specifies a shuffle of elements that is suitable for input to MOVSS,
> -/// MOVSD, and MOVD, i.e. setting the lowest element.
> -static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
> - if (VT.getVectorElementType().getSizeInBits() < 32)
> - return false;
> - if (!VT.is128BitVector())
> - return false;
> -
> - unsigned NumElts = VT.getVectorNumElements();
> -
> - if (!isUndefOrEqual(Mask[0], NumElts))
> - return false;
>
> - for (unsigned i = 1; i != NumElts; ++i)
> - if (!isUndefOrEqual(Mask[i], i))
> - return false;
> + // If we have a fake unary shuffle, the shuffle mask is spread across two
> + // inputs that are actually the same node. Re-map the mask to always point
> + // into the first input.
> + if (IsFakeUnary)
> + for (int &M : Mask)
> + if (M >= (int)Mask.size())
> + M -= Mask.size();
>
> return true;
> }
>
> -/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
> -/// as permutations between 128-bit chunks or halves. As an example: this
> -/// shuffle bellow:
> -/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
> -/// The first half comes from the second half of V1 and the second half from the
> -/// the second half of V2.
> -static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
> - if (!HasFp256 || !VT.is256BitVector())
> - return false;
> +/// getShuffleScalarElt - Returns the scalar element that will make up the ith
> +/// element of the result of the vector shuffle.
> +static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
> + unsigned Depth) {
> + if (Depth == 6)
> + return SDValue(); // Limit search depth.
>
> - // The shuffle result is divided into half A and half B. In total the two
> - // sources have 4 halves, namely: C, D, E, F. The final values of A and
> - // B must come from C, D, E or F.
> - unsigned HalfSize = VT.getVectorNumElements()/2;
> - bool MatchA = false, MatchB = false;
> -
> - // Check if A comes from one of C, D, E, F.
> - for (unsigned Half = 0; Half != 4; ++Half) {
> - if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
> - MatchA = true;
> - break;
> - }
> - }
> + SDValue V = SDValue(N, 0);
> + EVT VT = V.getValueType();
> + unsigned Opcode = V.getOpcode();
>
> - // Check if B comes from one of C, D, E, F.
> - for (unsigned Half = 0; Half != 4; ++Half) {
> - if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
> - MatchB = true;
> - break;
> - }
> + // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
> + if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
> + int Elt = SV->getMaskElt(Index);
> +
> + if (Elt < 0)
> + return DAG.getUNDEF(VT.getVectorElementType());
> +
> + unsigned NumElems = VT.getVectorNumElements();
> + SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
> + : SV->getOperand(1);
> + return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
> }
>
> - return MatchA && MatchB;
> -}
> + // Recurse into target specific vector shuffles to find scalars.
> + if (isTargetShuffle(Opcode)) {
> + MVT ShufVT = V.getSimpleValueType();
> + unsigned NumElems = ShufVT.getVectorNumElements();
> + SmallVector<int, 16> ShuffleMask;
> + bool IsUnary;
>
> -/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
> -/// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
> -static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
> - MVT VT = SVOp->getSimpleValueType(0);
> + if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
> + return SDValue();
>
> - unsigned HalfSize = VT.getVectorNumElements()/2;
> + int Elt = ShuffleMask[Index];
> + if (Elt < 0)
> + return DAG.getUNDEF(ShufVT.getVectorElementType());
>
> - unsigned FstHalf = 0, SndHalf = 0;
> - for (unsigned i = 0; i < HalfSize; ++i) {
> - if (SVOp->getMaskElt(i) > 0) {
> - FstHalf = SVOp->getMaskElt(i)/HalfSize;
> - break;
> - }
> + SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
> + : N->getOperand(1);
> + return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
> + Depth+1);
> }
> - for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
> - if (SVOp->getMaskElt(i) > 0) {
> - SndHalf = SVOp->getMaskElt(i)/HalfSize;
> - break;
> - }
> +
> + // Actual nodes that may contain scalar elements
> + if (Opcode == ISD::BITCAST) {
> + V = V.getOperand(0);
> + EVT SrcVT = V.getValueType();
> + unsigned NumElems = VT.getVectorNumElements();
> +
> + if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
> + return SDValue();
> }
>
> - return (FstHalf | (SndHalf << 4));
> + if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
> + return (Index == 0) ? V.getOperand(0)
> + : DAG.getUNDEF(VT.getVectorElementType());
> +
> + if (V.getOpcode() == ISD::BUILD_VECTOR)
> + return V.getOperand(Index);
> +
> + return SDValue();
> }
>
> -// Symmetric in-lane mask. Each lane has 4 elements (for imm8)
> -static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
> - unsigned EltSize = VT.getVectorElementType().getSizeInBits();
> - if (EltSize < 32)
> - return false;
> +/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
> +///
> +static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
> + unsigned NumNonZero, unsigned NumZero,
> + SelectionDAG &DAG,
> + const X86Subtarget* Subtarget,
> + const TargetLowering &TLI) {
> + if (NumNonZero > 8)
> + return SDValue();
>
> - unsigned NumElts = VT.getVectorNumElements();
> - Imm8 = 0;
> - if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
> - for (unsigned i = 0; i != NumElts; ++i) {
> - if (Mask[i] < 0)
> - continue;
> - Imm8 |= Mask[i] << (i*2);
> + SDLoc dl(Op);
> + SDValue V;
> + bool First = true;
> + for (unsigned i = 0; i < 16; ++i) {
> + bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
> + if (ThisIsNonZero && First) {
> + if (NumZero)
> + V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
> + else
> + V = DAG.getUNDEF(MVT::v8i16);
> + First = false;
> }
> - return true;
> - }
>
> - unsigned LaneSize = 4;
> - SmallVector<int, 4> MaskVal(LaneSize, -1);
> -
> - for (unsigned l = 0; l != NumElts; l += LaneSize) {
> - for (unsigned i = 0; i != LaneSize; ++i) {
> - if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
> - return false;
> - if (Mask[i+l] < 0)
> - continue;
> - if (MaskVal[i] < 0) {
> - MaskVal[i] = Mask[i+l] - l;
> - Imm8 |= MaskVal[i] << (i*2);
> - continue;
> + if ((i & 1) != 0) {
> + SDValue ThisElt, LastElt;
> + bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
> + if (LastIsNonZero) {
> + LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
> + MVT::i16, Op.getOperand(i-1));
> }
> - if (Mask[i+l] != (signed)(MaskVal[i]+l))
> - return false;
> + if (ThisIsNonZero) {
> + ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
> + ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
> + ThisElt, DAG.getConstant(8, MVT::i8));
> + if (LastIsNonZero)
> + ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
> + } else
> + ThisElt = LastElt;
> +
> + if (ThisElt.getNode())
> + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
> + DAG.getIntPtrConstant(i/2));
> }
> }
> - return true;
> -}
>
> -/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
> -/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
> -/// Note that VPERMIL mask matching is different depending whether theunderlying
> -/// type is 32 or 64. In the VPERMILPS the high half of the mask should point
> -/// to the same elements of the low, but to the higher half of the source.
> -/// In VPERMILPD the two lanes could be shuffled independently of each other
> -/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
> -static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
> - unsigned EltSize = VT.getVectorElementType().getSizeInBits();
> - if (VT.getSizeInBits() < 256 || EltSize < 32)
> - return false;
> - bool symmetricMaskRequired = (EltSize == 32);
> - unsigned NumElts = VT.getVectorNumElements();
> + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
> +}
>
> - unsigned NumLanes = VT.getSizeInBits()/128;
> - unsigned LaneSize = NumElts/NumLanes;
> - // 2 or 4 elements in one lane
> +/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
> +///
> +static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
> + unsigned NumNonZero, unsigned NumZero,
> + SelectionDAG &DAG,
> + const X86Subtarget* Subtarget,
> + const TargetLowering &TLI) {
> + if (NumNonZero > 4)
> + return SDValue();
>
> - SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
> - for (unsigned l = 0; l != NumElts; l += LaneSize) {
> - for (unsigned i = 0; i != LaneSize; ++i) {
> - if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
> - return false;
> - if (symmetricMaskRequired) {
> - if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
> - ExpectedMaskVal[i] = Mask[i+l] - l;
> - continue;
> - }
> - if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
> - return false;
> + SDLoc dl(Op);
> + SDValue V;
> + bool First = true;
> + for (unsigned i = 0; i < 8; ++i) {
> + bool isNonZero = (NonZeros & (1 << i)) != 0;
> + if (isNonZero) {
> + if (First) {
> + if (NumZero)
> + V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
> + else
> + V = DAG.getUNDEF(MVT::v8i16);
> + First = false;
> }
> + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
> + MVT::v8i16, V, Op.getOperand(i),
> + DAG.getIntPtrConstant(i));
> }
> }
> - return true;
> -}
>
> -/// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse
> -/// of what x86 movss want. X86 movs requires the lowest element to be lowest
> -/// element of vector 2 and the other elements to come from vector 1 in order.
> -static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
> - bool V2IsSplat = false, bool V2IsUndef = false) {
> - if (!VT.is128BitVector())
> - return false;
> + return V;
> +}
>
> - unsigned NumOps = VT.getVectorNumElements();
> - if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
> - return false;
> +/// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
> +static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
> + const X86Subtarget *Subtarget,
> + const TargetLowering &TLI) {
> + // Find all zeroable elements.
> + std::bitset<4> Zeroable;
> + for (int i=0; i < 4; ++i) {
> + SDValue Elt = Op->getOperand(i);
> + Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
> + }
> + assert(Zeroable.size() - Zeroable.count() > 1 &&
> + "We expect at least two non-zero elements!");
>
> - if (!isUndefOrEqual(Mask[0], 0))
> - return false;
> + // We only know how to deal with build_vector nodes where elements are either
> + // zeroable or extract_vector_elt with constant index.
> + SDValue FirstNonZero;
> + unsigned FirstNonZeroIdx;
> + for (unsigned i=0; i < 4; ++i) {
> + if (Zeroable[i])
> + continue;
> + SDValue Elt = Op->getOperand(i);
> + if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
> + !isa<ConstantSDNode>(Elt.getOperand(1)))
> + return SDValue();
> + // Make sure that this node is extracting from a 128-bit vector.
> + MVT VT = Elt.getOperand(0).getSimpleValueType();
> + if (!VT.is128BitVector())
> + return SDValue();
> + if (!FirstNonZero.getNode()) {
> + FirstNonZero = Elt;
> + FirstNonZeroIdx = i;
> + }
> + }
>
> - for (unsigned i = 1; i != NumOps; ++i)
> - if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
> - (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
> - (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
> - return false;
> + assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
> + SDValue V1 = FirstNonZero.getOperand(0);
> + MVT VT = V1.getSimpleValueType();
>
> - return true;
> -}
> -
> -/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
> -/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
> -/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
> -static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
> - const X86Subtarget *Subtarget) {
> - if (!Subtarget->hasSSE3())
> - return false;
> -
> - unsigned NumElems = VT.getVectorNumElements();
> -
> - if ((VT.is128BitVector() && NumElems != 4) ||
> - (VT.is256BitVector() && NumElems != 8) ||
> - (VT.is512BitVector() && NumElems != 16))
> - return false;
> -
> - // "i+1" is the value the indexed mask element must have
> - for (unsigned i = 0; i != NumElems; i += 2)
> - if (!isUndefOrEqual(Mask[i], i+1) ||
> - !isUndefOrEqual(Mask[i+1], i+1))
> - return false;
> -
> - return true;
> -}
> -
> -/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
> -/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
> -/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
> -static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
> - const X86Subtarget *Subtarget) {
> - if (!Subtarget->hasSSE3())
> - return false;
> -
> - unsigned NumElems = VT.getVectorNumElements();
> -
> - if ((VT.is128BitVector() && NumElems != 4) ||
> - (VT.is256BitVector() && NumElems != 8) ||
> - (VT.is512BitVector() && NumElems != 16))
> - return false;
> -
> - // "i" is the value the indexed mask element must have
> - for (unsigned i = 0; i != NumElems; i += 2)
> - if (!isUndefOrEqual(Mask[i], i) ||
> - !isUndefOrEqual(Mask[i+1], i))
> - return false;
> + // See if this build_vector can be lowered as a blend with zero.
> + SDValue Elt;
> + unsigned EltMaskIdx, EltIdx;
> + int Mask[4];
> + for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
> + if (Zeroable[EltIdx]) {
> + // The zero vector will be on the right hand side.
> + Mask[EltIdx] = EltIdx+4;
> + continue;
> + }
>
> - return true;
> -}
> + Elt = Op->getOperand(EltIdx);
> + // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
> + EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
> + if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
> + break;
> + Mask[EltIdx] = EltIdx;
> + }
>
> -/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
> -/// specifies a shuffle of elements that is suitable for input to 256-bit
> -/// version of MOVDDUP.
> -static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
> - if (!HasFp256 || !VT.is256BitVector())
> - return false;
> + if (EltIdx == 4) {
> + // Let the shuffle legalizer deal with blend operations.
> + SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
> + if (V1.getSimpleValueType() != VT)
> + V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
> + return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
> + }
>
> - unsigned NumElts = VT.getVectorNumElements();
> - if (NumElts != 4)
> - return false;
> + // See if we can lower this build_vector to a INSERTPS.
> + if (!Subtarget->hasSSE41())
> + return SDValue();
>
> - for (unsigned i = 0; i != NumElts/2; ++i)
> - if (!isUndefOrEqual(Mask[i], 0))
> - return false;
> - for (unsigned i = NumElts/2; i != NumElts; ++i)
> - if (!isUndefOrEqual(Mask[i], NumElts/2))
> - return false;
> - return true;
> -}
> + SDValue V2 = Elt.getOperand(0);
> + if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
> + V1 = SDValue();
>
> -/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
> -/// specifies a shuffle of elements that is suitable for input to 128-bit
> -/// version of MOVDDUP.
> -static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
> - if (!VT.is128BitVector())
> - return false;
> + bool CanFold = true;
> + for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
> + if (Zeroable[i])
> + continue;
>
> - unsigned e = VT.getVectorNumElements() / 2;
> - for (unsigned i = 0; i != e; ++i)
> - if (!isUndefOrEqual(Mask[i], i))
> - return false;
> - for (unsigned i = 0; i != e; ++i)
> - if (!isUndefOrEqual(Mask[e+i], i))
> - return false;
> - return true;
> -}
> + SDValue Current = Op->getOperand(i);
> + SDValue SrcVector = Current->getOperand(0);
> + if (!V1.getNode())
> + V1 = SrcVector;
> + CanFold = SrcVector == V1 &&
> + cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
> + }
>
> -/// isVEXTRACTIndex - Return true if the specified
> -/// EXTRACT_SUBVECTOR operand specifies a vector extract that is
> -/// suitable for instruction that extract 128 or 256 bit vectors
> -static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
> - assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
> - if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
> - return false;
> + if (!CanFold)
> + return SDValue();
>
> - // The index should be aligned on a vecWidth-bit boundary.
> - uint64_t Index =
> - cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
> + assert(V1.getNode() && "Expected at least two non-zero elements!");
> + if (V1.getSimpleValueType() != MVT::v4f32)
> + V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
> + if (V2.getSimpleValueType() != MVT::v4f32)
> + V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
>
> - MVT VT = N->getSimpleValueType(0);
> - unsigned ElSize = VT.getVectorElementType().getSizeInBits();
> - bool Result = (Index * ElSize) % vecWidth == 0;
> + // Ok, we can emit an INSERTPS instruction.
> + unsigned ZMask = Zeroable.to_ulong();
>
> - return Result;
> + unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
> + assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
> + SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
> + DAG.getIntPtrConstant(InsertPSMask));
> + return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
> }
>
> -/// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
> -/// operand specifies a subvector insert that is suitable for input to
> -/// insertion of 128 or 256-bit subvectors
> -static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
> - assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
> - if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
> - return false;
> - // The index should be aligned on a vecWidth-bit boundary.
> - uint64_t Index =
> - cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
> -
> - MVT VT = N->getSimpleValueType(0);
> - unsigned ElSize = VT.getVectorElementType().getSizeInBits();
> - bool Result = (Index * ElSize) % vecWidth == 0;
> -
> - return Result;
> +/// Return a vector logical shift node.
> +static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
> + unsigned NumBits, SelectionDAG &DAG,
> + const TargetLowering &TLI, SDLoc dl) {
> + assert(VT.is128BitVector() && "Unknown type for VShift");
> + MVT ShVT = MVT::v2i64;
> + unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
> + SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
> + MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
> + assert(NumBits % 8 == 0 && "Only support byte sized shifts");
> + SDValue ShiftVal = DAG.getConstant(NumBits/8, ScalarShiftTy);
> + return DAG.getNode(ISD::BITCAST, dl, VT,
> + DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
> }
>
> -bool X86::isVINSERT128Index(SDNode *N) {
> - return isVINSERTIndex(N, 128);
> -}
> +static SDValue
> +LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
>
> -bool X86::isVINSERT256Index(SDNode *N) {
> - return isVINSERTIndex(N, 256);
> -}
> + // Check if the scalar load can be widened into a vector load. And if
> + // the address is "base + cst" see if the cst can be "absorbed" into
> + // the shuffle mask.
> + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
> + SDValue Ptr = LD->getBasePtr();
> + if (!ISD::isNormalLoad(LD) || LD->isVolatile())
> + return SDValue();
> + EVT PVT = LD->getValueType(0);
> + if (PVT != MVT::i32 && PVT != MVT::f32)
> + return SDValue();
>
> -bool X86::isVEXTRACT128Index(SDNode *N) {
> - return isVEXTRACTIndex(N, 128);
> -}
> + int FI = -1;
> + int64_t Offset = 0;
> + if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
> + FI = FINode->getIndex();
> + Offset = 0;
> + } else if (DAG.isBaseWithConstantOffset(Ptr) &&
> + isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
> + FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
> + Offset = Ptr.getConstantOperandVal(1);
> + Ptr = Ptr.getOperand(0);
> + } else {
> + return SDValue();
> + }
>
> -bool X86::isVEXTRACT256Index(SDNode *N) {
> - return isVEXTRACTIndex(N, 256);
> -}
> + // FIXME: 256-bit vector instructions don't require a strict alignment,
> + // improve this code to support it better.
> + unsigned RequiredAlign = VT.getSizeInBits()/8;
> + SDValue Chain = LD->getChain();
> + // Make sure the stack object alignment is at least 16 or 32.
> + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
> + if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
> + if (MFI->isFixedObjectIndex(FI)) {
> + // Can't change the alignment. FIXME: It's possible to compute
> + // the exact stack offset and reference FI + adjust offset instead.
> + // If someone *really* cares about this. That's the way to implement it.
> + return SDValue();
> + } else {
> + MFI->setObjectAlignment(FI, RequiredAlign);
> + }
> + }
>
> -/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
>
> -/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
> -/// Handles 128-bit and 256-bit.
> -static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
> - MVT VT = N->getSimpleValueType(0);
> +  // (Offset % 16 or 32) must be a multiple of 4. The address is then
> + // Ptr + (Offset & ~15).
> + if (Offset < 0)
> + return SDValue();
> + if ((Offset % RequiredAlign) & 3)
> + return SDValue();
> + int64_t StartOffset = Offset & ~(RequiredAlign-1);
> + if (StartOffset)
> + Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
> + Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
>
> - assert((VT.getSizeInBits() >= 128) &&
> - "Unsupported vector type for PSHUF/SHUFP");
> + int EltNo = (Offset - StartOffset) >> 2;
> + unsigned NumElems = VT.getVectorNumElements();
>
> - // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
> - // independently on 128-bit lanes.
> - unsigned NumElts = VT.getVectorNumElements();
> - unsigned NumLanes = VT.getSizeInBits()/128;
> - unsigned NumLaneElts = NumElts/NumLanes;
> + EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
> + SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
> + LD->getPointerInfo().getWithOffset(StartOffset),
> + false, false, false, 0);
>
> - assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
> - "Only supports 2, 4 or 8 elements per lane");
> + SmallVector<int, 8> Mask(NumElems, EltNo);
>
> - unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
> - unsigned Mask = 0;
> - for (unsigned i = 0; i != NumElts; ++i) {
> - int Elt = N->getMaskElt(i);
> - if (Elt < 0) continue;
> - Elt &= NumLaneElts - 1;
> - unsigned ShAmt = (i << Shift) % 8;
> - Mask |= Elt << ShAmt;
> + return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
> }
>
> - return Mask;
> + return SDValue();
> }
>
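(Another aside, not from the patch: the offset folding above is easier to see with concrete numbers. Assuming a 128-bit vector, so RequiredAlign = 16, and a 4-byte scalar at frame offset 20:

    #include <cassert>

    int main() {
      // Illustrative values only: a 4-byte scalar at frame offset 20 is widened
      // to an aligned 16-byte load plus a splat of the lane that holds it.
      const int RequiredAlign = 16, Offset = 20;
      const int StartOffset = Offset & ~(RequiredAlign - 1); // 16: aligned base
      const int EltNo = (Offset - StartOffset) >> 2;         // 1: lane to splat
      assert(StartOffset == 16 && EltNo == 1);
      return 0;
    }
)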
> -/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
> -/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
> -static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
> - MVT VT = N->getSimpleValueType(0);
> -
> - assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
> - "Unsupported vector type for PSHUFHW");
> +/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
> +/// elements can be replaced by a single large load which has the same value as
> +/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
> +///
> +/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
> +///
> +/// FIXME: we'd also like to handle the case where the last elements are zero
> +/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
> +/// There's even a handy isZeroNode for that purpose.
> +static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
> + SDLoc &DL, SelectionDAG &DAG,
> + bool isAfterLegalize) {
> + unsigned NumElems = Elts.size();
>
> - unsigned NumElts = VT.getVectorNumElements();
> + LoadSDNode *LDBase = nullptr;
> + unsigned LastLoadedElt = -1U;
>
> - unsigned Mask = 0;
> - for (unsigned l = 0; l != NumElts; l += 8) {
> - // 8 nodes per lane, but we only care about the last 4.
> - for (unsigned i = 0; i < 4; ++i) {
> - int Elt = N->getMaskElt(l+i+4);
> - if (Elt < 0) continue;
> - Elt &= 0x3; // only 2-bits.
> - Mask |= Elt << (i * 2);
> + // For each element in the initializer, see if we've found a load or an undef.
> + // If we don't find an initial load element, or later load elements are
> + // non-consecutive, bail out.
> + for (unsigned i = 0; i < NumElems; ++i) {
> + SDValue Elt = Elts[i];
> + // Look through a bitcast.
> + if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
> + Elt = Elt.getOperand(0);
> + if (!Elt.getNode() ||
> + (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
> + return SDValue();
> + if (!LDBase) {
> + if (Elt.getNode()->getOpcode() == ISD::UNDEF)
> + return SDValue();
> + LDBase = cast<LoadSDNode>(Elt.getNode());
> + LastLoadedElt = i;
> + continue;
> }
> - }
> -
> - return Mask;
> -}
> -
> -/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
> -/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
> -static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
> - MVT VT = N->getSimpleValueType(0);
> -
> - assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
> - "Unsupported vector type for PSHUFHW");
> + if (Elt.getOpcode() == ISD::UNDEF)
> + continue;
>
> - unsigned NumElts = VT.getVectorNumElements();
> -
> - unsigned Mask = 0;
> - for (unsigned l = 0; l != NumElts; l += 8) {
> - // 8 nodes per lane, but we only care about the first 4.
> - for (unsigned i = 0; i < 4; ++i) {
> - int Elt = N->getMaskElt(l+i);
> - if (Elt < 0) continue;
> - Elt &= 0x3; // only 2-bits
> - Mask |= Elt << (i * 2);
> - }
> + LoadSDNode *LD = cast<LoadSDNode>(Elt);
> + EVT LdVT = Elt.getValueType();
> + // Each loaded element must be the correct fractional portion of the
> + // requested vector load.
> + if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
> + return SDValue();
> + if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
> + return SDValue();
> + LastLoadedElt = i;
> }
>
> - return Mask;
> -}
> + // If we have found an entire vector of loads and undefs, then return a large
> + // load of the entire vector width starting at the base pointer. If we found
> + // consecutive loads for the low half, generate a vzext_load node.
> + if (LastLoadedElt == NumElems - 1) {
> + assert(LDBase && "Did not find base load for merging consecutive loads");
> + EVT EltVT = LDBase->getValueType(0);
> + // Ensure that the input vector size for the merged loads matches the
> + // cumulative size of the input elements.
> + if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
> + return SDValue();
>
> -/// \brief Return the appropriate immediate to shuffle the specified
> -/// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with
> -/// VALIGN (if Interlane is true) instructions.
> -static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp,
> - bool InterLane) {
> - MVT VT = SVOp->getSimpleValueType(0);
> - unsigned EltSize = InterLane ? 1 :
> - VT.getVectorElementType().getSizeInBits() >> 3;
> + if (isAfterLegalize &&
> + !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
> + return SDValue();
>
> - unsigned NumElts = VT.getVectorNumElements();
> - unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
> - unsigned NumLaneElts = NumElts/NumLanes;
> + SDValue NewLd = SDValue();
>
> - int Val = 0;
> - unsigned i;
> - for (i = 0; i != NumElts; ++i) {
> - Val = SVOp->getMaskElt(i);
> - if (Val >= 0)
> - break;
> - }
> - if (Val >= (int)NumElts)
> - Val -= NumElts - NumLaneElts;
> + NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
> + LDBase->getPointerInfo(), LDBase->isVolatile(),
> + LDBase->isNonTemporal(), LDBase->isInvariant(),
> + LDBase->getAlignment());
>
> - assert(Val - i > 0 && "PALIGNR imm should be positive");
> - return (Val - i) * EltSize;
> -}
> + if (LDBase->hasAnyUseOfValue(1)) {
> + SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
> + SDValue(LDBase, 1),
> + SDValue(NewLd.getNode(), 1));
> + DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
> + DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
> + SDValue(NewLd.getNode(), 1));
> + }
>
> -/// \brief Return the appropriate immediate to shuffle the specified
> -/// VECTOR_SHUFFLE mask with the PALIGNR instruction.
> -static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
> - return getShuffleAlignrImmediate(SVOp, false);
> -}
> + return NewLd;
> + }
>
> -/// \brief Return the appropriate immediate to shuffle the specified
> -/// VECTOR_SHUFFLE mask with the VALIGN instruction.
> -static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) {
> - return getShuffleAlignrImmediate(SVOp, true);
> -}
> +  // TODO: The code below fires only for loading the low v2i32 / v2f32
> +  // of a v4i32 / v4f32. It's probably worth generalizing.
> + EVT EltVT = VT.getVectorElementType();
> + if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
> + DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
> + SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
> + SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
> + SDValue ResNode =
> + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
> + LDBase->getPointerInfo(),
> + LDBase->getAlignment(),
> + false/*isVolatile*/, true/*ReadMem*/,
> + false/*WriteMem*/);
>
> + // Make sure the newly-created LOAD is in the same position as LDBase in
> + // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
> + // update uses of LDBase's output chain to use the TokenFactor.
> + if (LDBase->hasAnyUseOfValue(1)) {
> + SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
> + SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
> + DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
> + DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
> + SDValue(ResNode.getNode(), 1));
> + }
>
> -static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
> - assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
> - if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
> - llvm_unreachable("Illegal extract subvector for VEXTRACT");
> + return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
> + }
> + return SDValue();
> +}
>
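(Not part of the patch: the shape this helper is after is, at the source level, a vector built out of adjacent scalar loads. Whether the DAG actually arrives in that build_vector-of-consecutive-loads form depends on earlier phases, so treat this purely as an illustration:

    // Four adjacent i32 loads feeding one 4-element result; when they survive
    // as a build_vector of consecutive loads, the merge above can replace them
    // with a single 16-byte load from &p[0].
    void gather_adjacent(const int *p, int out[4]) {
      out[0] = p[0];
      out[1] = p[1];
      out[2] = p[2];
      out[3] = p[3];
    }
)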
> - uint64_t Index =
> - cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
> +/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
> +/// to generate a splat value for the following cases:
> +/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
> +/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
> +/// a scalar load, or a constant.
> +/// The VBROADCAST node is returned when a pattern is found,
> +/// or SDValue() otherwise.
> +static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
> + SelectionDAG &DAG) {
> + // VBROADCAST requires AVX.
> + // TODO: Splats could be generated for non-AVX CPUs using SSE
> + // instructions, but there's less potential gain for only 128-bit vectors.
> + if (!Subtarget->hasAVX())
> + return SDValue();
>
> - MVT VecVT = N->getOperand(0).getSimpleValueType();
> - MVT ElVT = VecVT.getVectorElementType();
> + MVT VT = Op.getSimpleValueType();
> + SDLoc dl(Op);
>
> - unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
> - return Index / NumElemsPerChunk;
> -}
> + assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
> + "Unsupported vector type for broadcast.");
>
> -static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
> - assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
> - if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
> - llvm_unreachable("Illegal insert subvector for VINSERT");
> + SDValue Ld;
> + bool ConstSplatVal;
>
> - uint64_t Index =
> - cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
> + switch (Op.getOpcode()) {
> + default:
> + // Unknown pattern found.
> + return SDValue();
>
> - MVT VecVT = N->getSimpleValueType(0);
> - MVT ElVT = VecVT.getVectorElementType();
> + case ISD::BUILD_VECTOR: {
> + auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
> + BitVector UndefElements;
> + SDValue Splat = BVOp->getSplatValue(&UndefElements);
>
> - unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
> - return Index / NumElemsPerChunk;
> -}
> + // We need a splat of a single value to use broadcast, and it doesn't
> + // make any sense if the value is only in one element of the vector.
> + if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
> + return SDValue();
>
> -/// getExtractVEXTRACT128Immediate - Return the appropriate immediate
> -/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
> -/// and VINSERTI128 instructions.
> -unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
> - return getExtractVEXTRACTImmediate(N, 128);
> -}
> + Ld = Splat;
> + ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
> + Ld.getOpcode() == ISD::ConstantFP);
>
> -/// getExtractVEXTRACT256Immediate - Return the appropriate immediate
> -/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
> -/// and VINSERTI64x4 instructions.
> -unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
> - return getExtractVEXTRACTImmediate(N, 256);
> -}
> + // Make sure that all of the users of a non-constant load are from the
> + // BUILD_VECTOR node.
> + if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
> + return SDValue();
> + break;
> + }
>
> -/// getInsertVINSERT128Immediate - Return the appropriate immediate
> -/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
> -/// and VINSERTI128 instructions.
> -unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
> - return getInsertVINSERTImmediate(N, 128);
> -}
> + case ISD::VECTOR_SHUFFLE: {
> + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
>
> -/// getInsertVINSERT256Immediate - Return the appropriate immediate
> -/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4
> -/// and VINSERTI64x4 instructions.
> -unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
> - return getInsertVINSERTImmediate(N, 256);
> -}
> + // Shuffles must have a splat mask where the first element is
> + // broadcasted.
> + if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
> + return SDValue();
>
> -/// isZero - Returns true if Elt is a constant integer zero
> -static bool isZero(SDValue V) {
> - ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
> - return C && C->isNullValue();
> -}
> + SDValue Sc = Op.getOperand(0);
> + if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
> + Sc.getOpcode() != ISD::BUILD_VECTOR) {
>
> -/// isZeroNode - Returns true if Elt is a constant zero or a floating point
> -/// constant +0.0.
> -bool X86::isZeroNode(SDValue Elt) {
> - if (isZero(Elt))
> - return true;
> - if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
> - return CFP->getValueAPF().isPosZero();
> - return false;
> -}
> + if (!Subtarget->hasInt256())
> + return SDValue();
>
> -/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
> -/// match movhlps. The lower half elements should come from upper half of
> -/// V1 (and in order), and the upper half elements should come from the upper
> -/// half of V2 (and in order).
> -static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
> - if (!VT.is128BitVector())
> - return false;
> - if (VT.getVectorNumElements() != 4)
> - return false;
> - for (unsigned i = 0, e = 2; i != e; ++i)
> - if (!isUndefOrEqual(Mask[i], i+2))
> - return false;
> - for (unsigned i = 2; i != 4; ++i)
> - if (!isUndefOrEqual(Mask[i], i+4))
> - return false;
> - return true;
> -}
> + // Use the register form of the broadcast instruction available on AVX2.
> + if (VT.getSizeInBits() >= 256)
> + Sc = Extract128BitVector(Sc, 0, DAG, dl);
> + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
> + }
>
> -/// isScalarLoadToVector - Returns true if the node is a scalar load that
> -/// is promoted to a vector. It also returns the LoadSDNode by reference if
> -/// required.
> -static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
> - if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
> - return false;
> - N = N->getOperand(0).getNode();
> - if (!ISD::isNON_EXTLoad(N))
> - return false;
> - if (LD)
> - *LD = cast<LoadSDNode>(N);
> - return true;
> -}
> + Ld = Sc.getOperand(0);
> + ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
> + Ld.getOpcode() == ISD::ConstantFP);
>
> -// Test whether the given value is a vector value which will be legalized
> -// into a load.
> -static bool WillBeConstantPoolLoad(SDNode *N) {
> - if (N->getOpcode() != ISD::BUILD_VECTOR)
> - return false;
> + // The scalar_to_vector node and the suspected
> + // load node must have exactly one user.
> + // Constants may have multiple users.
>
> - // Check for any non-constant elements.
> - for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
> - switch (N->getOperand(i).getNode()->getOpcode()) {
> - case ISD::UNDEF:
> - case ISD::ConstantFP:
> - case ISD::Constant:
> +    // AVX-512 has a register version of the broadcast
> + bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
> + Ld.getValueType().getSizeInBits() >= 32;
> + if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
> + !hasRegVer))
> + return SDValue();
> break;
> - default:
> - return false;
> }
> + }
>
> - // Vectors of all-zeros and all-ones are materialized with special
> - // instructions rather than being loaded.
> - return !ISD::isBuildVectorAllZeros(N) &&
> - !ISD::isBuildVectorAllOnes(N);
> -}
> + unsigned ScalarSize = Ld.getValueType().getSizeInBits();
> + bool IsGE256 = (VT.getSizeInBits() >= 256);
>
> -/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
> -/// match movlp{s|d}. The lower half elements should come from lower half of
> -/// V1 (and in order), and the upper half elements should come from the upper
> -/// half of V2 (and in order). And since V1 will become the source of the
> -/// MOVLP, it must be either a vector load or a scalar load to vector.
> -static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
> - ArrayRef<int> Mask, MVT VT) {
> - if (!VT.is128BitVector())
> - return false;
> + // When optimizing for size, generate up to 5 extra bytes for a broadcast
> + // instruction to save 8 or more bytes of constant pool data.
> + // TODO: If multiple splats are generated to load the same constant,
> + // it may be detrimental to overall size. There needs to be a way to detect
> + // that condition to know if this is truly a size win.
> + const Function *F = DAG.getMachineFunction().getFunction();
> + bool OptForSize = F->hasFnAttribute(Attribute::OptimizeForSize);
>
> - if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
> - return false;
> -  // If V2 is a vector load, don't do this transformation. We will try to use
> - // load folding shufps op.
> - if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
> - return false;
> + // Handle broadcasting a single constant scalar from the constant pool
> + // into a vector.
> + // On Sandybridge (no AVX2), it is still better to load a constant vector
> + // from the constant pool and not to broadcast it from a scalar.
> + // But override that restriction when optimizing for size.
> + // TODO: Check if splatting is recommended for other AVX-capable CPUs.
> + if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
> + EVT CVT = Ld.getValueType();
> + assert(!CVT.isVector() && "Must not broadcast a vector type");
>
> - unsigned NumElems = VT.getVectorNumElements();
> + // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
> + // For size optimization, also splat v2f64 and v2i64, and for size opt
> + // with AVX2, also splat i8 and i16.
> + // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
> + if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
> + (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
> + const Constant *C = nullptr;
> + if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
> + C = CI->getConstantIntValue();
> + else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
> + C = CF->getConstantFPValue();
>
> - if (NumElems != 2 && NumElems != 4)
> - return false;
> - for (unsigned i = 0, e = NumElems/2; i != e; ++i)
> - if (!isUndefOrEqual(Mask[i], i))
> - return false;
> - for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
> - if (!isUndefOrEqual(Mask[i], i+NumElems))
> - return false;
> - return true;
> -}
> + assert(C && "Invalid constant type");
>
> -/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
> -/// to a zero vector.
> -/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
> -static bool isZeroShuffle(ShuffleVectorSDNode *N) {
> - SDValue V1 = N->getOperand(0);
> - SDValue V2 = N->getOperand(1);
> - unsigned NumElems = N->getValueType(0).getVectorNumElements();
> - for (unsigned i = 0; i != NumElems; ++i) {
> - int Idx = N->getMaskElt(i);
> - if (Idx >= (int)NumElems) {
> - unsigned Opc = V2.getOpcode();
> - if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
> - continue;
> - if (Opc != ISD::BUILD_VECTOR ||
> - !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
> - return false;
> - } else if (Idx >= 0) {
> - unsigned Opc = V1.getOpcode();
> - if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
> - continue;
> - if (Opc != ISD::BUILD_VECTOR ||
> - !X86::isZeroNode(V1.getOperand(Idx)))
> - return false;
> + const TargetLowering &TLI = DAG.getTargetLoweringInfo();
> + SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
> + unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
> + Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
> + MachinePointerInfo::getConstantPool(),
> + false, false, false, Alignment);
> +
> + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
> }
> }
> - return true;
> -}
>
> -/// getZeroVector - Returns a vector of specified type with all zero elements.
> -///
> -static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
> - SelectionDAG &DAG, SDLoc dl) {
> - assert(VT.isVector() && "Expected a vector type");
> + bool IsLoad = ISD::isNormalLoad(Ld.getNode());
>
> - // Always build SSE zero vectors as <4 x i32> bitcasted
> - // to their dest type. This ensures they get CSE'd.
> - SDValue Vec;
> - if (VT.is128BitVector()) { // SSE
> - if (Subtarget->hasSSE2()) { // SSE2
> - SDValue Cst = DAG.getConstant(0, MVT::i32);
> - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
> - } else { // SSE1
> - SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
> - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
> - }
> - } else if (VT.is256BitVector()) { // AVX
> - if (Subtarget->hasInt256()) { // AVX2
> - SDValue Cst = DAG.getConstant(0, MVT::i32);
> - SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
> - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
> - } else {
> - // 256-bit logic and arithmetic instructions in AVX are all
> - // floating-point, no support for integer ops. Emit fp zeroed vectors.
> - SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
> - SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
> - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
> - }
> - } else if (VT.is512BitVector()) { // AVX-512
> - SDValue Cst = DAG.getConstant(0, MVT::i32);
> - SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
> - Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
> - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
> - } else if (VT.getScalarType() == MVT::i1) {
> - assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
> - SDValue Cst = DAG.getConstant(0, MVT::i1);
> - SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
> - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
> - } else
> - llvm_unreachable("Unexpected vector type");
> + // Handle AVX2 in-register broadcasts.
> + if (!IsLoad && Subtarget->hasInt256() &&
> + (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
> + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
>
> - return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
> -}
> + // The scalar source must be a normal load.
> + if (!IsLoad)
> + return SDValue();
>
> -/// getOnesVector - Returns a vector of specified type with all bits set.
> -/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
> -/// no AVX2 support, use two <4 x i32> inserted in an <8 x i32> appropriately.
> -/// Then bitcast to their original type, ensuring they get CSE'd.
> -static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
> - SDLoc dl) {
> - assert(VT.isVector() && "Expected a vector type");
> + if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
> + (Subtarget->hasVLX() && ScalarSize == 64))
> + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
>
> - SDValue Cst = DAG.getConstant(~0U, MVT::i32);
> - SDValue Vec;
> - if (VT.is256BitVector()) {
> - if (HasInt256) { // AVX2
> - SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
> - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
> - } else { // AVX
> - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
> - Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
> - }
> - } else if (VT.is128BitVector()) {
> - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
> - } else
> - llvm_unreachable("Unexpected vector type");
> + // The integer check is needed for the 64-bit into 128-bit so it doesn't match
> + // double since there is no vbroadcastsd xmm
> + if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
> + if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
> + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
> + }
>
> - return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
> + // Unsupported broadcast.
> + return SDValue();
> }
>
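(Aside, not in the patch: case 1 above, a splat BUILD_VECTOR fed by one scalar load, roughly corresponds to source like the following; on AVX it can become a single VBROADCASTSS from memory instead of a load plus shuffle, subject to the one-use checks in the code and to how the vectorizer shaped the DAG:

    // Broadcast one float from memory into all eight lanes of a 256-bit value.
    void splat_load(const float *p, float out[8]) {
      for (int i = 0; i < 8; ++i)
        out[i] = *p;
    }
)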
> -/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
> -/// that point to V2 point to its first element.
> -static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
> - for (unsigned i = 0; i != NumElems; ++i) {
> - if (Mask[i] > (int)NumElems) {
> - Mask[i] = NumElems;
> - }
> - }
> -}
> +/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
> +/// underlying vector and index.
> +///
> +/// Modifies \p ExtractedFromVec to the real vector and returns the real
> +/// index.
> +static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
> + SDValue ExtIdx) {
> + int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
> + if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
> + return Idx;
>
> -/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
> -/// operation of specified width.
> -static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
> - SDValue V2) {
> - unsigned NumElems = VT.getVectorNumElements();
> - SmallVector<int, 8> Mask;
> - Mask.push_back(NumElems);
> - for (unsigned i = 1; i != NumElems; ++i)
> - Mask.push_back(i);
> - return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
> -}
> + // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
> + // lowered this:
> + // (extract_vector_elt (v8f32 %vreg1), Constant<6>)
> + // to:
> + // (extract_vector_elt (vector_shuffle<2,u,u,u>
> + // (extract_subvector (v8f32 %vreg0), Constant<4>),
> + // undef)
> + // Constant<0>)
> + // In this case the vector is the extract_subvector expression and the index
> + // is 2, as specified by the shuffle.
> + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
> + SDValue ShuffleVec = SVOp->getOperand(0);
> + MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
> + assert(ShuffleVecVT.getVectorElementType() ==
> + ExtractedFromVec.getSimpleValueType().getVectorElementType());
>
> -/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
> -static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
> - SDValue V2) {
> - unsigned NumElems = VT.getVectorNumElements();
> - SmallVector<int, 8> Mask;
> - for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
> - Mask.push_back(i);
> - Mask.push_back(i + NumElems);
> + int ShuffleIdx = SVOp->getMaskElt(Idx);
> + if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
> + ExtractedFromVec = ShuffleVec;
> + return ShuffleIdx;
> }
> - return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
> + return Idx;
> }
>
> -/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
> -static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
> - SDValue V2) {
> - unsigned NumElems = VT.getVectorNumElements();
> - SmallVector<int, 8> Mask;
> - for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
> - Mask.push_back(i + Half);
> - Mask.push_back(i + NumElems + Half);
> - }
> - return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
> -}
> +static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
> + MVT VT = Op.getSimpleValueType();
>
> -// PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
> -// a generic shuffle instruction because the target has no such instructions.
> -// Generate shuffles which repeat i16 and i8 several times until they can be
> -// represented by v4f32 and then be manipulated by target supported shuffles.
> -static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
> - MVT VT = V.getSimpleValueType();
> - int NumElems = VT.getVectorNumElements();
> - SDLoc dl(V);
> + // Skip if insert_vec_elt is not supported.
> + const TargetLowering &TLI = DAG.getTargetLoweringInfo();
> + if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
> + return SDValue();
>
> - while (NumElems > 4) {
> - if (EltNo < NumElems/2) {
> - V = getUnpackl(DAG, dl, VT, V, V);
> - } else {
> - V = getUnpackh(DAG, dl, VT, V, V);
> - EltNo -= NumElems/2;
> - }
> - NumElems >>= 1;
> - }
> - return V;
> -}
> + SDLoc DL(Op);
> + unsigned NumElems = Op.getNumOperands();
>
> -/// getLegalSplat - Generate a legal splat with supported x86 shuffles
> -static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
> - MVT VT = V.getSimpleValueType();
> - SDLoc dl(V);
> + SDValue VecIn1;
> + SDValue VecIn2;
> + SmallVector<unsigned, 4> InsertIndices;
> + SmallVector<int, 8> Mask(NumElems, -1);
>
> - if (VT.is128BitVector()) {
> - V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
> - int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
> - V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
> - &SplatMask[0]);
> - } else if (VT.is256BitVector()) {
> -    // To use VPERMILPS to splat scalars, the second half of indices must
> - // refer to the higher part, which is a duplication of the lower one,
> - // because VPERMILPS can only handle in-lane permutations.
> - int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
> - EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
> -
> - V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
> - V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
> - &SplatMask[0]);
> - } else
> - llvm_unreachable("Vector size not supported");
> + for (unsigned i = 0; i != NumElems; ++i) {
> + unsigned Opc = Op.getOperand(i).getOpcode();
>
> - return DAG.getNode(ISD::BITCAST, dl, VT, V);
> -}
> + if (Opc == ISD::UNDEF)
> + continue;
> +
> + if (Opc != ISD::EXTRACT_VECTOR_ELT) {
> + // Quit if more than 1 elements need inserting.
> + if (InsertIndices.size() > 1)
> + return SDValue();
> +
> + InsertIndices.push_back(i);
> + continue;
> + }
>
> -/// PromoteSplat - Splat is promoted to target supported vector shuffles.
> -static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
> - MVT SrcVT = SV->getSimpleValueType(0);
> - SDValue V1 = SV->getOperand(0);
> - SDLoc dl(SV);
> + SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
> + SDValue ExtIdx = Op.getOperand(i).getOperand(1);
> + // Quit if non-constant index.
> + if (!isa<ConstantSDNode>(ExtIdx))
> + return SDValue();
> + int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
>
> - int EltNo = SV->getSplatIndex();
> - int NumElems = SrcVT.getVectorNumElements();
> - bool Is256BitVec = SrcVT.is256BitVector();
> + // Quit if extracted from vector of different type.
> + if (ExtractedFromVec.getValueType() != VT)
> + return SDValue();
>
> - assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
> - "Unknown how to promote splat for type");
> + if (!VecIn1.getNode())
> + VecIn1 = ExtractedFromVec;
> + else if (VecIn1 != ExtractedFromVec) {
> + if (!VecIn2.getNode())
> + VecIn2 = ExtractedFromVec;
> + else if (VecIn2 != ExtractedFromVec)
> + // Quit if more than 2 vectors to shuffle
> + return SDValue();
> + }
>
> - // Extract the 128-bit part containing the splat element and update
> - // the splat element index when it refers to the higher register.
> - if (Is256BitVec) {
> - V1 = Extract128BitVector(V1, EltNo, DAG, dl);
> - if (EltNo >= NumElems/2)
> - EltNo -= NumElems/2;
> + if (ExtractedFromVec == VecIn1)
> + Mask[i] = Idx;
> + else if (ExtractedFromVec == VecIn2)
> + Mask[i] = Idx + NumElems;
> }
>
> - // All i16 and i8 vector types can't be used directly by a generic shuffle
> - // instruction because the target has no such instruction. Generate shuffles
> - // which repeat i16 and i8 several times until they fit in i32, and then can
> -  // be manipulated by target supported shuffles.
> - MVT EltVT = SrcVT.getVectorElementType();
> - if (EltVT == MVT::i8 || EltVT == MVT::i16)
> - V1 = PromoteSplati8i16(V1, DAG, EltNo);
> + if (!VecIn1.getNode())
> + return SDValue();
>
> - // Recreate the 256-bit vector and place the same 128-bit vector
> - // into the low and high part. This is necessary because we want
> - // to use VPERM* to shuffle the vectors
> - if (Is256BitVec) {
> - V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
> + VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
> + SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
> + for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
> + unsigned Idx = InsertIndices[i];
> + NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
> + DAG.getIntPtrConstant(Idx));
> }
>
> - return getLegalSplat(DAG, V1, EltNo);
> + return NV;
> }
>
> -/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
> -/// vector of zero or undef vector. This produces a shuffle where the low
> -/// element of V2 is swizzled into the zero/undef vector, landing at element
> -/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
> -static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
> - bool IsZero,
> - const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> - MVT VT = V2.getSimpleValueType();
> - SDValue V1 = IsZero
> - ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
> - unsigned NumElems = VT.getVectorNumElements();
> - SmallVector<int, 16> MaskVec;
> - for (unsigned i = 0; i != NumElems; ++i)
> - // If this is the insertion idx, put the low elt of V2 here.
> - MaskVec.push_back(i == Idx ? NumElems : i);
> - return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
> -}
> +// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
> +SDValue
> +X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
>
> -/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
> -/// target specific opcode. Returns true if the Mask could be calculated. Sets
> -/// IsUnary to true if only uses one source. Note that this will set IsUnary for
> -/// shuffles which use a single input multiple times, and in those cases it will
> -/// adjust the mask to only have indices within that single input.
> -static bool getTargetShuffleMask(SDNode *N, MVT VT,
> - SmallVectorImpl<int> &Mask, bool &IsUnary) {
> - unsigned NumElems = VT.getVectorNumElements();
> - SDValue ImmN;
> + MVT VT = Op.getSimpleValueType();
> + assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
> + "Unexpected type in LowerBUILD_VECTORvXi1!");
>
> - IsUnary = false;
> - bool IsFakeUnary = false;
> - switch(N->getOpcode()) {
> - case X86ISD::BLENDI:
> - ImmN = N->getOperand(N->getNumOperands()-1);
> - DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
> - break;
> - case X86ISD::SHUFP:
> - ImmN = N->getOperand(N->getNumOperands()-1);
> - DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
> - IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
> - break;
> - case X86ISD::UNPCKH:
> - DecodeUNPCKHMask(VT, Mask);
> - IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
> - break;
> - case X86ISD::UNPCKL:
> - DecodeUNPCKLMask(VT, Mask);
> - IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
> - break;
> - case X86ISD::MOVHLPS:
> - DecodeMOVHLPSMask(NumElems, Mask);
> - IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
> - break;
> - case X86ISD::MOVLHPS:
> - DecodeMOVLHPSMask(NumElems, Mask);
> - IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
> - break;
> - case X86ISD::PALIGNR:
> - ImmN = N->getOperand(N->getNumOperands()-1);
> - DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
> - break;
> - case X86ISD::PSHUFD:
> - case X86ISD::VPERMILPI:
> - ImmN = N->getOperand(N->getNumOperands()-1);
> - DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
> - IsUnary = true;
> - break;
> - case X86ISD::PSHUFHW:
> - ImmN = N->getOperand(N->getNumOperands()-1);
> - DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
> - IsUnary = true;
> - break;
> - case X86ISD::PSHUFLW:
> - ImmN = N->getOperand(N->getNumOperands()-1);
> - DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
> - IsUnary = true;
> - break;
> - case X86ISD::PSHUFB: {
> - IsUnary = true;
> - SDValue MaskNode = N->getOperand(1);
> - while (MaskNode->getOpcode() == ISD::BITCAST)
> - MaskNode = MaskNode->getOperand(0);
> + SDLoc dl(Op);
> + if (ISD::isBuildVectorAllZeros(Op.getNode())) {
> + SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
> + SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
> + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
> + }
>
> - if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
> - // If we have a build-vector, then things are easy.
> - EVT VT = MaskNode.getValueType();
> - assert(VT.isVector() &&
> - "Can't produce a non-vector with a build_vector!");
> - if (!VT.isInteger())
> - return false;
> + if (ISD::isBuildVectorAllOnes(Op.getNode())) {
> + SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
> + SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
> + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
> + }
>
> - int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
> + bool AllContants = true;
> + uint64_t Immediate = 0;
> + int NonConstIdx = -1;
> + bool IsSplat = true;
> + unsigned NumNonConsts = 0;
> + unsigned NumConsts = 0;
> + for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
> + SDValue In = Op.getOperand(idx);
> + if (In.getOpcode() == ISD::UNDEF)
> + continue;
> + if (!isa<ConstantSDNode>(In)) {
> + AllContants = false;
> + NonConstIdx = idx;
> + NumNonConsts++;
> + } else {
> + NumConsts++;
> + if (cast<ConstantSDNode>(In)->getZExtValue())
> + Immediate |= (1ULL << idx);
> + }
> + if (In != Op.getOperand(0))
> + IsSplat = false;
> + }
>
> - SmallVector<uint64_t, 32> RawMask;
> - for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
> - SDValue Op = MaskNode->getOperand(i);
> - if (Op->getOpcode() == ISD::UNDEF) {
> - RawMask.push_back((uint64_t)SM_SentinelUndef);
> - continue;
> - }
> - auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
> - if (!CN)
> - return false;
> - APInt MaskElement = CN->getAPIntValue();
> + if (AllContants) {
> + SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
> + DAG.getConstant(Immediate, MVT::i16));
> + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
> + DAG.getIntPtrConstant(0));
> + }
>
> - // We now have to decode the element which could be any integer size and
> - // extract each byte of it.
> - for (int j = 0; j < NumBytesPerElement; ++j) {
> - // Note that this is x86 and so always little endian: the low byte is
> - // the first byte of the mask.
> - RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
> - MaskElement = MaskElement.lshr(8);
> - }
> - }
> - DecodePSHUFBMask(RawMask, Mask);
> - break;
> + if (NumNonConsts == 1 && NonConstIdx != 0) {
> + SDValue DstVec;
> + if (NumConsts) {
> + SDValue VecAsImm = DAG.getConstant(Immediate,
> + MVT::getIntegerVT(VT.getSizeInBits()));
> + DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
> }
> + else
> + DstVec = DAG.getUNDEF(VT);
> + return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
> + Op.getOperand(NonConstIdx),
> + DAG.getIntPtrConstant(NonConstIdx));
> + }
> + if (!IsSplat && (NonConstIdx != 0))
> + llvm_unreachable("Unsupported BUILD_VECTOR operation");
> + MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
> + SDValue Select;
> + if (IsSplat)
> + Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
> + DAG.getConstant(-1, SelectVT),
> + DAG.getConstant(0, SelectVT));
> + else
> + Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
> + DAG.getConstant((Immediate | 1), SelectVT),
> + DAG.getConstant(Immediate, SelectVT));
> + return DAG.getNode(ISD::BITCAST, dl, VT, Select);
> +}
>
> - auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
> - if (!MaskLoad)
> - return false;
> +/// \brief Return true if \p N implements a horizontal binop and return the
> +/// operands for the horizontal binop into V0 and V1.
> +///
> +/// This is a helper function of PerformBUILD_VECTORCombine.
> +/// This function checks that the build_vector \p N in input implements a
> +/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
> +/// operation to match.
> +/// For example, if \p Opcode is equal to ISD::ADD, then this function
> +/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
> +/// is equal to ISD::SUB, then this function checks if this is a horizontal
> +/// arithmetic sub.
> +///
> +/// This function only analyzes elements of \p N whose indices are
> +/// in range [BaseIdx, LastIdx).
> +static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
> + SelectionDAG &DAG,
> + unsigned BaseIdx, unsigned LastIdx,
> + SDValue &V0, SDValue &V1) {
> + EVT VT = N->getValueType(0);
>
> - SDValue Ptr = MaskLoad->getBasePtr();
> - if (Ptr->getOpcode() == X86ISD::Wrapper)
> - Ptr = Ptr->getOperand(0);
> + assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
> + assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
> + "Invalid Vector in input!");
>
> - auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
> - if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
> - return false;
> + bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
> + bool CanFold = true;
> + unsigned ExpectedVExtractIdx = BaseIdx;
> + unsigned NumElts = LastIdx - BaseIdx;
> + V0 = DAG.getUNDEF(VT);
> + V1 = DAG.getUNDEF(VT);
>
> - if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
> - DecodePSHUFBMask(C, Mask);
> - if (Mask.empty())
> - return false;
> - break;
> + // Check if N implements a horizontal binop.
> + for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
> + SDValue Op = N->getOperand(i + BaseIdx);
> +
> + // Skip UNDEFs.
> + if (Op->getOpcode() == ISD::UNDEF) {
> + // Update the expected vector extract index.
> + if (i * 2 == NumElts)
> + ExpectedVExtractIdx = BaseIdx;
> + ExpectedVExtractIdx += 2;
> + continue;
> }
>
> - return false;
> - }
> - case X86ISD::VPERMI:
> - ImmN = N->getOperand(N->getNumOperands()-1);
> - DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
> - IsUnary = true;
> - break;
> - case X86ISD::MOVSS:
> - case X86ISD::MOVSD:
> - DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
> - break;
> - case X86ISD::VPERM2X128:
> - ImmN = N->getOperand(N->getNumOperands()-1);
> - DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
> - if (Mask.empty()) return false;
> - break;
> - case X86ISD::MOVSLDUP:
> - DecodeMOVSLDUPMask(VT, Mask);
> - IsUnary = true;
> - break;
> - case X86ISD::MOVSHDUP:
> - DecodeMOVSHDUPMask(VT, Mask);
> - IsUnary = true;
> - break;
> - case X86ISD::MOVDDUP:
> - DecodeMOVDDUPMask(VT, Mask);
> -         "Unsupported vector type for PSHUFLW");
> - break;
> - case X86ISD::MOVLHPD:
> - case X86ISD::MOVLPD:
> - case X86ISD::MOVLPS:
> - // Not yet implemented
> - return false;
> - default: llvm_unreachable("unknown target shuffle node");
> - }
> + CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
>
> - // If we have a fake unary shuffle, the shuffle mask is spread across two
> - // inputs that are actually the same node. Re-map the mask to always point
> - // into the first input.
> - if (IsFakeUnary)
> - for (int &M : Mask)
> - if (M >= (int)Mask.size())
> - M -= Mask.size();
> + if (!CanFold)
> + break;
>
> - return true;
> -}
> + SDValue Op0 = Op.getOperand(0);
> + SDValue Op1 = Op.getOperand(1);
>
> -/// getShuffleScalarElt - Returns the scalar element that will make up the ith
> -/// element of the result of the vector shuffle.
> -static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
> - unsigned Depth) {
> - if (Depth == 6)
> - return SDValue(); // Limit search depth.
> + // Try to match the following pattern:
> + // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
> + CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
> + Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
> + Op0.getOperand(0) == Op1.getOperand(0) &&
> + isa<ConstantSDNode>(Op0.getOperand(1)) &&
> + isa<ConstantSDNode>(Op1.getOperand(1)));
> + if (!CanFold)
> + break;
>
> - SDValue V = SDValue(N, 0);
> - EVT VT = V.getValueType();
> - unsigned Opcode = V.getOpcode();
> + unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
> + unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
>
> - // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
> - if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
> - int Elt = SV->getMaskElt(Index);
> + if (i * 2 < NumElts) {
> + if (V0.getOpcode() == ISD::UNDEF)
> + V0 = Op0.getOperand(0);
> + } else {
> + if (V1.getOpcode() == ISD::UNDEF)
> + V1 = Op0.getOperand(0);
> + if (i * 2 == NumElts)
> + ExpectedVExtractIdx = BaseIdx;
> + }
>
> - if (Elt < 0)
> - return DAG.getUNDEF(VT.getVectorElementType());
> + SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
> + if (I0 == ExpectedVExtractIdx)
> + CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
> + else if (IsCommutable && I1 == ExpectedVExtractIdx) {
> + // Try to match the following dag sequence:
> + // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
> + CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
> + } else
> + CanFold = false;
>
> - unsigned NumElems = VT.getVectorNumElements();
> - SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
> - : SV->getOperand(1);
> - return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
> + ExpectedVExtractIdx += 2;
> }
>
> - // Recurse into target specific vector shuffles to find scalars.
> - if (isTargetShuffle(Opcode)) {
> - MVT ShufVT = V.getSimpleValueType();
> - unsigned NumElems = ShufVT.getVectorNumElements();
> - SmallVector<int, 16> ShuffleMask;
> - bool IsUnary;
> + return CanFold;
> +}
>
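(Aside, not in the patch: the build_vector pattern this matcher accepts is the element-wise description of HADDPS/HADDPD, i.e. each result lane is the sum of one adjacent pair, with the lower lanes taken from V0 and the upper lanes from V1. In scalar form, illustrative only:

    // What a 4 x float horizontal add computes, lane by lane.
    void hadd_pattern(const float v0[4], const float v1[4], float r[4]) {
      r[0] = v0[0] + v0[1];
      r[1] = v0[2] + v0[3];
      r[2] = v1[0] + v1[1];
      r[3] = v1[2] + v1[3];
    }
)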
> - if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
> - return SDValue();
> +/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
> +/// a concat_vector.
> +///
> +/// This is a helper function of PerformBUILD_VECTORCombine.
> +/// This function expects two 256-bit vectors called V0 and V1.
> +/// At first, each vector is split into two separate 128-bit vectors.
> +/// Then, the resulting 128-bit vectors are used to implement two
> +/// horizontal binary operations.
> +///
> +/// The kind of horizontal binary operation is defined by \p X86Opcode.
> +///
> +/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
> +/// the two new horizontal binop.
> +/// When Mode is set, the first horizontal binop dag node would take as input
> +/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
> +/// horizontal binop dag node would take as input the lower 128-bit of V1
> +/// and the upper 128-bit of V1.
> +/// Example:
> +/// HADD V0_LO, V0_HI
> +/// HADD V1_LO, V1_HI
> +///
> +/// Otherwise, the first horizontal binop dag node takes as input the lower
> +/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
> +/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
> +/// Example:
> +/// HADD V0_LO, V1_LO
> +/// HADD V0_HI, V1_HI
> +///
> +/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
> +/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
> +/// the upper 128-bits of the result.
> +static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
> + SDLoc DL, SelectionDAG &DAG,
> + unsigned X86Opcode, bool Mode,
> + bool isUndefLO, bool isUndefHI) {
> + EVT VT = V0.getValueType();
> + assert(VT.is256BitVector() && VT == V1.getValueType() &&
> + "Invalid nodes in input!");
>
> - int Elt = ShuffleMask[Index];
> - if (Elt < 0)
> - return DAG.getUNDEF(ShufVT.getVectorElementType());
> + unsigned NumElts = VT.getVectorNumElements();
> + SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
> + SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
> + SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
> + SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
> + EVT NewVT = V0_LO.getValueType();
>
> - SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
> - : N->getOperand(1);
> - return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
> - Depth+1);
> - }
> + SDValue LO = DAG.getUNDEF(NewVT);
> + SDValue HI = DAG.getUNDEF(NewVT);
>
> - // Actual nodes that may contain scalar elements
> - if (Opcode == ISD::BITCAST) {
> - V = V.getOperand(0);
> - EVT SrcVT = V.getValueType();
> - unsigned NumElems = VT.getVectorNumElements();
> + if (Mode) {
> + // Don't emit a horizontal binop if the result is expected to be UNDEF.
> + if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
> + LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
> + if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
> + HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
> + } else {
> + // Don't emit a horizontal binop if the result is expected to be UNDEF.
> + if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
> + V1_LO->getOpcode() != ISD::UNDEF))
> + LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
>
> - if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
> - return SDValue();
> + if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
> + V1_HI->getOpcode() != ISD::UNDEF))
> + HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
> }
>
> - if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
> - return (Index == 0) ? V.getOperand(0)
> - : DAG.getUNDEF(VT.getVectorElementType());
> -
> - if (V.getOpcode() == ISD::BUILD_VECTOR)
> - return V.getOperand(Index);
> -
> - return SDValue();
> + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
> }
>
> -/// getNumOfConsecutiveZeros - Return the number of elements of a vector
> -/// shuffle operation which consecutively come from zero. The
> -/// search can start in two different directions, from left or right.
> -/// We count undefs as zeros until PreferredNum is reached.
> -static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
> - unsigned NumElems, bool ZerosFromLeft,
> - SelectionDAG &DAG,
> - unsigned PreferredNum = -1U) {
> - unsigned NumZeros = 0;
> - for (unsigned i = 0; i != NumElems; ++i) {
> - unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
> - SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
> - if (!Elt.getNode())
> - break;
> -
> - if (X86::isZeroNode(Elt))
> - ++NumZeros;
> - else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
> - NumZeros = std::min(NumZeros + 1, PreferredNum);
> - else
> - break;
> - }
> +/// \brief Try to fold a build_vector that performs an 'addsub' into the
> +/// sequence of 'vadd + vsub + blendi'.
> +static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
> + const X86Subtarget *Subtarget) {
> + SDLoc DL(BV);
> + EVT VT = BV->getValueType(0);
> + unsigned NumElts = VT.getVectorNumElements();
> + SDValue InVec0 = DAG.getUNDEF(VT);
> + SDValue InVec1 = DAG.getUNDEF(VT);
>
> - return NumZeros;
> -}
> + assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
> + VT == MVT::v2f64) && "build_vector with an invalid type found!");
>
> -/// isShuffleMaskConsecutive - Check if the shuffle mask indices [MaskI, MaskE)
> -/// correspond consecutively to elements from one of the vector operands,
> -/// starting from its index OpIdx. Also tell OpNum which source vector operand.
> -static
> -bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
> - unsigned MaskI, unsigned MaskE, unsigned OpIdx,
> - unsigned NumElems, unsigned &OpNum) {
> - bool SeenV1 = false;
> - bool SeenV2 = false;
> -
> - for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
> - int Idx = SVOp->getMaskElt(i);
> -    // Ignore undef indices
> - if (Idx < 0)
> - continue;
> + // Odd-numbered elements in the input build vector are obtained from
> + // adding two integer/float elements.
> + // Even-numbered elements in the input build vector are obtained from
> + // subtracting two integer/float elements.
> + unsigned ExpectedOpcode = ISD::FSUB;
> + unsigned NextExpectedOpcode = ISD::FADD;
> + bool AddFound = false;
> + bool SubFound = false;
>
> - if (Idx < (int)NumElems)
> - SeenV1 = true;
> - else
> - SeenV2 = true;
> + for (unsigned i = 0, e = NumElts; i != e; ++i) {
> + SDValue Op = BV->getOperand(i);
>
> - // Only accept consecutive elements from the same vector
> - if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
> - return false;
> - }
> + // Skip 'undef' values.
> + unsigned Opcode = Op.getOpcode();
> + if (Opcode == ISD::UNDEF) {
> + std::swap(ExpectedOpcode, NextExpectedOpcode);
> + continue;
> + }
>
> - OpNum = SeenV1 ? 0 : 1;
> - return true;
> -}
> + // Early exit if we found an unexpected opcode.
> + if (Opcode != ExpectedOpcode)
> + return SDValue();
>
> -/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
> -/// logical right shift of a vector.
> -static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
> - bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
> - unsigned NumElems =
> - SVOp->getSimpleValueType(0).getVectorNumElements();
> - unsigned NumZeros = getNumOfConsecutiveZeros(
> - SVOp, NumElems, false /* check zeros from right */, DAG,
> - SVOp->getMaskElt(0));
> - unsigned OpSrc;
> + SDValue Op0 = Op.getOperand(0);
> + SDValue Op1 = Op.getOperand(1);
>
> - if (!NumZeros)
> - return false;
> + // Try to match the following pattern:
> + // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
> + // Early exit if we cannot match that sequence.
> + if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
> + Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
> + !isa<ConstantSDNode>(Op0.getOperand(1)) ||
> + !isa<ConstantSDNode>(Op1.getOperand(1)) ||
> + Op0.getOperand(1) != Op1.getOperand(1))
> + return SDValue();
>
> - // Considering the elements in the mask that are not consecutive zeros,
> - // check if they consecutively come from only one of the source vectors.
> - //
> - // V1 = {X, A, B, C} 0
> - // \ \ \ /
> - // vector_shuffle V1, V2 <1, 2, 3, X>
> - //
> - if (!isShuffleMaskConsecutive(SVOp,
> - 0, // Mask Start Index
> - NumElems-NumZeros, // Mask End Index(exclusive)
> - NumZeros, // Where to start looking in the src vector
> - NumElems, // Number of elements in vector
> - OpSrc)) // Which source operand ?
> - return false;
> + unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
> + if (I0 != i)
> + return SDValue();
>
> - isLeft = false;
> - ShAmt = NumZeros;
> - ShVal = SVOp->getOperand(OpSrc);
> - return true;
> -}
> + // We found a valid add/sub node. Update the information accordingly.
> + if (i & 1)
> + AddFound = true;
> + else
> + SubFound = true;
>
> -/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
> -/// logical left shift of a vector.
> -static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
> - bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
> - unsigned NumElems =
> - SVOp->getSimpleValueType(0).getVectorNumElements();
> - unsigned NumZeros = getNumOfConsecutiveZeros(
> - SVOp, NumElems, true /* check zeros from left */, DAG,
> - NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
> - unsigned OpSrc;
> + // Update InVec0 and InVec1.
> + if (InVec0.getOpcode() == ISD::UNDEF)
> + InVec0 = Op0.getOperand(0);
> + if (InVec1.getOpcode() == ISD::UNDEF)
> + InVec1 = Op1.getOperand(0);
>
> - if (!NumZeros)
> - return false;
> + // Make sure that the operands of each add/sub node always come from
> + // the same pair of vectors.
> + if (InVec0 != Op0.getOperand(0)) {
> + if (ExpectedOpcode == ISD::FSUB)
> + return SDValue();
>
> - // Considering the elements in the mask that are not consecutive zeros,
> - // check if they consecutively come from only one of the source vectors.
> - //
> - // 0 { A, B, X, X } = V2
> - // / \ / /
> - // vector_shuffle V1, V2 <X, X, 4, 5>
> - //
> - if (!isShuffleMaskConsecutive(SVOp,
> - NumZeros, // Mask Start Index
> - NumElems, // Mask End Index(exclusive)
> - 0, // Where to start looking in the src vector
> - NumElems, // Number of elements in vector
> - OpSrc)) // Which source operand ?
> - return false;
> + // FADD is commutable. Try to commute the operands
> + // and then test again.
> + std::swap(Op0, Op1);
> + if (InVec0 != Op0.getOperand(0))
> + return SDValue();
> + }
>
> - isLeft = true;
> - ShAmt = NumZeros;
> - ShVal = SVOp->getOperand(OpSrc);
> - return true;
> -}
> + if (InVec1 != Op1.getOperand(0))
> + return SDValue();
>
> -/// isVectorShift - Returns true if the shuffle can be implemented as a
> -/// logical left or right shift of a vector.
> -static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
> - bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
> - // Although the logic below support any bitwidth size, there are no
> - // shift instructions which handle more than 128-bit vectors.
> - if (!SVOp->getSimpleValueType(0).is128BitVector())
> - return false;
> + // Update the pair of expected opcodes.
> + std::swap(ExpectedOpcode, NextExpectedOpcode);
> + }
>
> - if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
> - isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
> - return true;
> + // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
> + if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
> + InVec1.getOpcode() != ISD::UNDEF)
> + return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
>
> - return false;
> + return SDValue();
> }
>
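As a quick aside (not part of the patch): the shape matchAddSub is looking for is easiest to see as a scalar loop over a hypothetical 4-lane float vector, where even lanes subtract and odd lanes add, with both operands taken from the same lane of the same two sources:

    // Illustrative sketch only; the names A, B, Out are made up.
    void addsub4(const float A[4], const float B[4], float Out[4]) {
      for (int i = 0; i < 4; ++i)
        Out[i] = (i & 1) ? A[i] + B[i]   // odd lane  -> FADD
                         : A[i] - B[i];  // even lane -> FSUB
    }

When every build_vector operand has that shape, the whole node collapses into a single X86ISD::ADDSUB (addsubps/addsubpd).
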
> -/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
> -///
> -static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
> - unsigned NumNonZero, unsigned NumZero,
> - SelectionDAG &DAG,
> - const X86Subtarget* Subtarget,
> - const TargetLowering &TLI) {
> - if (NumNonZero > 8)
> - return SDValue();
> +static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
> + const X86Subtarget *Subtarget) {
> + SDLoc DL(N);
> + EVT VT = N->getValueType(0);
> + unsigned NumElts = VT.getVectorNumElements();
> + BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
> + SDValue InVec0, InVec1;
>
> - SDLoc dl(Op);
> - SDValue V;
> - bool First = true;
> - for (unsigned i = 0; i < 16; ++i) {
> - bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
> - if (ThisIsNonZero && First) {
> - if (NumZero)
> - V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
> - else
> - V = DAG.getUNDEF(MVT::v8i16);
> - First = false;
> - }
> + // Try to match an ADDSUB.
> + if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
> + (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
> + SDValue Value = matchAddSub(BV, DAG, Subtarget);
> + if (Value.getNode())
> + return Value;
> + }
>
> - if ((i & 1) != 0) {
> - SDValue ThisElt, LastElt;
> - bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
> - if (LastIsNonZero) {
> - LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
> - MVT::i16, Op.getOperand(i-1));
> - }
> - if (ThisIsNonZero) {
> - ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
> - ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
> - ThisElt, DAG.getConstant(8, MVT::i8));
> - if (LastIsNonZero)
> - ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
> - } else
> - ThisElt = LastElt;
> + // Try to match horizontal ADD/SUB.
> + unsigned NumUndefsLO = 0;
> + unsigned NumUndefsHI = 0;
> + unsigned Half = NumElts/2;
>
> - if (ThisElt.getNode())
> - V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
> - DAG.getIntPtrConstant(i/2));
> - }
> - }
> + // Count the number of UNDEF operands in the input build_vector.
> + for (unsigned i = 0, e = Half; i != e; ++i)
> + if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
> + NumUndefsLO++;
>
> - return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
> -}
> + for (unsigned i = Half, e = NumElts; i != e; ++i)
> + if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
> + NumUndefsHI++;
>
> -/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
> -///
> -static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
> - unsigned NumNonZero, unsigned NumZero,
> - SelectionDAG &DAG,
> - const X86Subtarget* Subtarget,
> - const TargetLowering &TLI) {
> - if (NumNonZero > 4)
> + // Early exit if this is either a build_vector of all UNDEFs, or if all
> + // the operands but one are UNDEF.
> + if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
> return SDValue();
>
> - SDLoc dl(Op);
> - SDValue V;
> - bool First = true;
> - for (unsigned i = 0; i < 8; ++i) {
> - bool isNonZero = (NonZeros & (1 << i)) != 0;
> - if (isNonZero) {
> - if (First) {
> - if (NumZero)
> - V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
> - else
> - V = DAG.getUNDEF(MVT::v8i16);
> - First = false;
> - }
> - V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
> - MVT::v8i16, V, Op.getOperand(i),
> - DAG.getIntPtrConstant(i));
> - }
> - }
> -
> - return V;
> -}
> + if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
> + // Try to match an SSE3 float HADD/HSUB.
> + if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
> + return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
>
> -/// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
> -static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
> - const X86Subtarget *Subtarget,
> - const TargetLowering &TLI) {
> - // Find all zeroable elements.
> - std::bitset<4> Zeroable;
> - for (int i=0; i < 4; ++i) {
> - SDValue Elt = Op->getOperand(i);
> - Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
> - }
> - assert(Zeroable.size() - Zeroable.count() > 1 &&
> - "We expect at least two non-zero elements!");
> + if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
> + return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
> + } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
> + // Try to match an SSSE3 integer HADD/HSUB.
> + if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
> + return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
>
> - // We only know how to deal with build_vector nodes where elements are either
> - // zeroable or extract_vector_elt with constant index.
> - SDValue FirstNonZero;
> - unsigned FirstNonZeroIdx;
> - for (unsigned i=0; i < 4; ++i) {
> - if (Zeroable[i])
> - continue;
> - SDValue Elt = Op->getOperand(i);
> - if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
> - !isa<ConstantSDNode>(Elt.getOperand(1)))
> - return SDValue();
> - // Make sure that this node is extracting from a 128-bit vector.
> - MVT VT = Elt.getOperand(0).getSimpleValueType();
> - if (!VT.is128BitVector())
> - return SDValue();
> - if (!FirstNonZero.getNode()) {
> - FirstNonZero = Elt;
> - FirstNonZeroIdx = i;
> - }
> + if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
> + return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
> }
>
> - assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
> - SDValue V1 = FirstNonZero.getOperand(0);
> - MVT VT = V1.getSimpleValueType();
> -
> - // See if this build_vector can be lowered as a blend with zero.
> - SDValue Elt;
> - unsigned EltMaskIdx, EltIdx;
> - int Mask[4];
> - for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
> - if (Zeroable[EltIdx]) {
> - // The zero vector will be on the right hand side.
> - Mask[EltIdx] = EltIdx+4;
> - continue;
> - }
> + if (!Subtarget->hasAVX())
> + return SDValue();
>
> - Elt = Op->getOperand(EltIdx);
> - // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
> - EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
> - if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
> - break;
> - Mask[EltIdx] = EltIdx;
> - }
> + if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
> + // Try to match an AVX horizontal add/sub of packed single/double
> + // precision floating point values from 256-bit vectors.
> + SDValue InVec2, InVec3;
> + if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
> + isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
> + ((InVec0.getOpcode() == ISD::UNDEF ||
> + InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
> + ((InVec1.getOpcode() == ISD::UNDEF ||
> + InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
> + return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
>
> - if (EltIdx == 4) {
> - // Let the shuffle legalizer deal with blend operations.
> - SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
> - if (V1.getSimpleValueType() != VT)
> - V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
> - return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
> - }
> + if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
> + isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
> + ((InVec0.getOpcode() == ISD::UNDEF ||
> + InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
> + ((InVec1.getOpcode() == ISD::UNDEF ||
> + InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
> + return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
> + } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
> + // Try to match an AVX2 horizontal add/sub of signed integers.
> + SDValue InVec2, InVec3;
> + unsigned X86Opcode;
> + bool CanFold = true;
>
> - // See if we can lower this build_vector to a INSERTPS.
> - if (!Subtarget->hasSSE41())
> - return SDValue();
> + if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
> + isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
> + ((InVec0.getOpcode() == ISD::UNDEF ||
> + InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
> + ((InVec1.getOpcode() == ISD::UNDEF ||
> + InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
> + X86Opcode = X86ISD::HADD;
> + else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
> + isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
> + ((InVec0.getOpcode() == ISD::UNDEF ||
> + InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
> + ((InVec1.getOpcode() == ISD::UNDEF ||
> + InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
> + X86Opcode = X86ISD::HSUB;
> + else
> + CanFold = false;
>
> - SDValue V2 = Elt.getOperand(0);
> - if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
> - V1 = SDValue();
> + if (CanFold) {
> + // Fold this build_vector into a single horizontal add/sub.
> + // Do this only if the target has AVX2.
> + if (Subtarget->hasAVX2())
> + return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
>
> - bool CanFold = true;
> - for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
> - if (Zeroable[i])
> - continue;
> + // Do not try to expand this build_vector into a pair of horizontal
> + // add/sub if we can emit a pair of scalar add/sub.
> + if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
> + return SDValue();
>
> - SDValue Current = Op->getOperand(i);
> - SDValue SrcVector = Current->getOperand(0);
> - if (!V1.getNode())
> - V1 = SrcVector;
> - CanFold = SrcVector == V1 &&
> - cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
> + // Convert this build_vector into a pair of horizontal binop followed by
> + // a concat vector.
> + bool isUndefLO = NumUndefsLO == Half;
> + bool isUndefHI = NumUndefsHI == Half;
> + return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
> + isUndefLO, isUndefHI);
> + }
> }
>
> - if (!CanFold)
> - return SDValue();
> + if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
> + VT == MVT::v16i16) && Subtarget->hasAVX()) {
> + unsigned X86Opcode;
> + if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
> + X86Opcode = X86ISD::HADD;
> + else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
> + X86Opcode = X86ISD::HSUB;
> + else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
> + X86Opcode = X86ISD::FHADD;
> + else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
> + X86Opcode = X86ISD::FHSUB;
> + else
> + return SDValue();
>
> - assert(V1.getNode() && "Expected at least two non-zero elements!");
> - if (V1.getSimpleValueType() != MVT::v4f32)
> - V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
> - if (V2.getSimpleValueType() != MVT::v4f32)
> - V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
> -
> - // Ok, we can emit an INSERTPS instruction.
> - unsigned ZMask = Zeroable.to_ulong();
> + // Don't try to expand this build_vector into a pair of horizontal add/sub
> + // if we can simply emit a pair of scalar add/sub.
> + if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
> + return SDValue();
>
> - unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
> - assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
> - SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
> - DAG.getIntPtrConstant(InsertPSMask));
> - return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
> -}
> + // Convert this build_vector into two horizontal add/sub followed by
> + // a concat vector.
> + bool isUndefLO = NumUndefsLO == Half;
> + bool isUndefHI = NumUndefsHI == Half;
> + return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
> + isUndefLO, isUndefHI);
> + }
>
> -/// Return a vector logical shift node.
> -static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
> - unsigned NumBits, SelectionDAG &DAG,
> - const TargetLowering &TLI, SDLoc dl) {
> - assert(VT.is128BitVector() && "Unknown type for VShift");
> - MVT ShVT = MVT::v2i64;
> - unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
> - SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
> - MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
> - assert(NumBits % 8 == 0 && "Only support byte sized shifts");
> - SDValue ShiftVal = DAG.getConstant(NumBits/8, ScalarShiftTy);
> - return DAG.getNode(ISD::BITCAST, dl, VT,
> - DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
> + return SDValue();
> }
>
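For the horizontal matching above, a scalar model of what FHADD/HADD produce may help (my sketch, assuming the usual SSE3 haddps semantics rather than anything stated in the patch):

    // Adjacent pairs within each source are summed; A fills the low half of
    // the result and B fills the high half.
    void hadd4(const float A[4], const float B[4], float Out[4]) {
      Out[0] = A[0] + A[1];
      Out[1] = A[2] + A[3];
      Out[2] = B[0] + B[1];
      Out[3] = B[2] + B[3];
    }

Roughly, isHorizontalBinOp checks that each build_vector element is such a pairwise add (or sub) of consecutive extracts, and the 256-bit paths additionally require the low and high halves to agree on which input vectors they draw from.
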
> -static SDValue
> -LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
> -
> - // Check if the scalar load can be widened into a vector load. And if
> - // the address is "base + cst" see if the cst can be "absorbed" into
> - // the shuffle mask.
> - if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
> - SDValue Ptr = LD->getBasePtr();
> - if (!ISD::isNormalLoad(LD) || LD->isVolatile())
> - return SDValue();
> - EVT PVT = LD->getValueType(0);
> - if (PVT != MVT::i32 && PVT != MVT::f32)
> - return SDValue();
> -
> - int FI = -1;
> - int64_t Offset = 0;
> - if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
> - FI = FINode->getIndex();
> - Offset = 0;
> - } else if (DAG.isBaseWithConstantOffset(Ptr) &&
> - isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
> - FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
> - Offset = Ptr.getConstantOperandVal(1);
> - Ptr = Ptr.getOperand(0);
> - } else {
> - return SDValue();
> - }
> +SDValue
> +X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
> + SDLoc dl(Op);
>
> - // FIXME: 256-bit vector instructions don't require a strict alignment,
> - // improve this code to support it better.
> - unsigned RequiredAlign = VT.getSizeInBits()/8;
> - SDValue Chain = LD->getChain();
> - // Make sure the stack object alignment is at least 16 or 32.
> - MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
> - if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
> - if (MFI->isFixedObjectIndex(FI)) {
> - // Can't change the alignment. FIXME: It's possible to compute
> - // the exact stack offset and reference FI + adjust offset instead.
> - // If someone *really* cares about this. That's the way to implement it.
> - return SDValue();
> - } else {
> - MFI->setObjectAlignment(FI, RequiredAlign);
> - }
> - }
> + MVT VT = Op.getSimpleValueType();
> + MVT ExtVT = VT.getVectorElementType();
> + unsigned NumElems = Op.getNumOperands();
>
> - // (Offset % 16 or 32) must be a multiple of 4. The address is then
> - // Ptr + (Offset & ~15).
> - if (Offset < 0)
> - return SDValue();
> - if ((Offset % RequiredAlign) & 3)
> - return SDValue();
> - int64_t StartOffset = Offset & ~(RequiredAlign-1);
> - if (StartOffset)
> - Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
> - Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
> + // Generate vectors for predicate vectors.
> + if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
> + return LowerBUILD_VECTORvXi1(Op, DAG);
>
> - int EltNo = (Offset - StartOffset) >> 2;
> - unsigned NumElems = VT.getVectorNumElements();
> + // Vectors containing all zeros can be matched by pxor and xorps later
> + if (ISD::isBuildVectorAllZeros(Op.getNode())) {
> + // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
> + // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
> + if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
> + return Op;
>
> - EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
> - SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
> - LD->getPointerInfo().getWithOffset(StartOffset),
> - false, false, false, 0);
> + return getZeroVector(VT, Subtarget, DAG, dl);
> + }
>
> - SmallVector<int, 8> Mask(NumElems, EltNo);
> + // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
> + // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
> + // vpcmpeqd on 256-bit vectors.
> + if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
> + if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
> + return Op;
>
> - return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
> + if (!VT.is512BitVector())
> + return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
> }
>
> - return SDValue();
> -}
> -
> -/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
> -/// elements can be replaced by a single large load which has the same value as
> -/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
> -///
> -/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
> -///
> -/// FIXME: we'd also like to handle the case where the last elements are zero
> -/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
> -/// There's even a handy isZeroNode for that purpose.
> -static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
> - SDLoc &DL, SelectionDAG &DAG,
> - bool isAfterLegalize) {
> - unsigned NumElems = Elts.size();
> + SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
> + if (Broadcast.getNode())
> + return Broadcast;
>
> - LoadSDNode *LDBase = nullptr;
> - unsigned LastLoadedElt = -1U;
> + unsigned EVTBits = ExtVT.getSizeInBits();
>
> - // For each element in the initializer, see if we've found a load or an undef.
> - // If we don't find an initial load element, or later load elements are
> - // non-consecutive, bail out.
> + unsigned NumZero = 0;
> + unsigned NumNonZero = 0;
> + unsigned NonZeros = 0;
> + bool IsAllConstants = true;
> + SmallSet<SDValue, 8> Values;
> for (unsigned i = 0; i < NumElems; ++i) {
> - SDValue Elt = Elts[i];
> - // Look through a bitcast.
> - if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
> - Elt = Elt.getOperand(0);
> - if (!Elt.getNode() ||
> - (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
> - return SDValue();
> - if (!LDBase) {
> - if (Elt.getNode()->getOpcode() == ISD::UNDEF)
> - return SDValue();
> - LDBase = cast<LoadSDNode>(Elt.getNode());
> - LastLoadedElt = i;
> - continue;
> - }
> + SDValue Elt = Op.getOperand(i);
> if (Elt.getOpcode() == ISD::UNDEF)
> continue;
> -
> - LoadSDNode *LD = cast<LoadSDNode>(Elt);
> - EVT LdVT = Elt.getValueType();
> - // Each loaded element must be the correct fractional portion of the
> - // requested vector load.
> - if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
> - return SDValue();
> - if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
> - return SDValue();
> - LastLoadedElt = i;
> + Values.insert(Elt);
> + if (Elt.getOpcode() != ISD::Constant &&
> + Elt.getOpcode() != ISD::ConstantFP)
> + IsAllConstants = false;
> + if (X86::isZeroNode(Elt))
> + NumZero++;
> + else {
> + NonZeros |= (1 << i);
> + NumNonZero++;
> + }
> }
>
> - // If we have found an entire vector of loads and undefs, then return a large
> - // load of the entire vector width starting at the base pointer. If we found
> - // consecutive loads for the low half, generate a vzext_load node.
> - if (LastLoadedElt == NumElems - 1) {
> - assert(LDBase && "Did not find base load for merging consecutive loads");
> - EVT EltVT = LDBase->getValueType(0);
> - // Ensure that the input vector size for the merged loads matches the
> - // cumulative size of the input elements.
> - if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
> - return SDValue();
> -
> - if (isAfterLegalize &&
> - !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
> - return SDValue();
> + // All undef vector. Return an UNDEF. All zero vectors were handled above.
> + if (NumNonZero == 0)
> + return DAG.getUNDEF(VT);
>
> - SDValue NewLd = SDValue();
> + // Special case for single non-zero, non-undef, element.
> + if (NumNonZero == 1) {
> + unsigned Idx = countTrailingZeros(NonZeros);
> + SDValue Item = Op.getOperand(Idx);
>
> - NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
> - LDBase->getPointerInfo(), LDBase->isVolatile(),
> - LDBase->isNonTemporal(), LDBase->isInvariant(),
> - LDBase->getAlignment());
> + // If this is an insertion of an i64 value on x86-32, and if the top bits of
> + // the value are obviously zero, truncate the value to i32 and do the
> + // insertion that way. Only do this if the value is non-constant or if the
> + // value is a constant being inserted into element 0. It is cheaper to do
> + // a constant pool load than it is to do a movd + shuffle.
> + if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
> + (!IsAllConstants || Idx == 0)) {
> + if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
> + // Handle SSE only.
> + assert(VT == MVT::v2i64 && "Expected an SSE value type!");
> + EVT VecVT = MVT::v4i32;
>
> - if (LDBase->hasAnyUseOfValue(1)) {
> - SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
> - SDValue(LDBase, 1),
> - SDValue(NewLd.getNode(), 1));
> - DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
> - DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
> - SDValue(NewLd.getNode(), 1));
> + // Truncate the value (which may itself be a constant) to i32, and
> + // convert it to a vector with movd (S2V+shuffle to zero extend).
> + Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
> + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
> + return DAG.getNode(
> + ISD::BITCAST, dl, VT,
> + getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
> + }
> }
>
> - return NewLd;
> - }
> + // If we have a constant or non-constant insertion into the low element of
> + // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
> + // the rest of the elements. This will be matched as movd/movq/movss/movsd
> + // depending on what the source datatype is.
> + if (Idx == 0) {
> + if (NumZero == 0)
> + return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
>
> - // TODO: The code below fires only for loading the low v2i32 / v2f32
> - //of a v4i32 / v4f32. It's probably worth generalizing.
> - EVT EltVT = VT.getVectorElementType();
> - if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
> - DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
> - SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
> - SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
> - SDValue ResNode =
> - DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
> - LDBase->getPointerInfo(),
> - LDBase->getAlignment(),
> - false/*isVolatile*/, true/*ReadMem*/,
> - false/*WriteMem*/);
> + if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
> + (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
> + if (VT.is256BitVector() || VT.is512BitVector()) {
> + SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
> + return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
> + Item, DAG.getIntPtrConstant(0));
> + }
> + assert(VT.is128BitVector() && "Expected an SSE value type!");
> + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
> + // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
> + return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
> + }
>
> - // Make sure the newly-created LOAD is in the same position as LDBase in
> - // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
> - // update uses of LDBase's output chain to use the TokenFactor.
> - if (LDBase->hasAnyUseOfValue(1)) {
> - SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
> - SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
> - DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
> - DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
> - SDValue(ResNode.getNode(), 1));
> + if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
> + Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
> + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
> + if (VT.is256BitVector()) {
> + SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
> + Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
> + } else {
> + assert(VT.is128BitVector() && "Expected an SSE value type!");
> + Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
> + }
> + return DAG.getNode(ISD::BITCAST, dl, VT, Item);
> + }
> }
>
> - return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
> + // Is it a vector logical left shift?
> + if (NumElems == 2 && Idx == 1 &&
> + X86::isZeroNode(Op.getOperand(0)) &&
> + !X86::isZeroNode(Op.getOperand(1))) {
> + unsigned NumBits = VT.getSizeInBits();
> + return getVShift(true, VT,
> + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
> + VT, Op.getOperand(1)),
> + NumBits/2, DAG, *this, dl);
> + }
> +
> + if (IsAllConstants) // Otherwise, it's better to do a constpool load.
> + return SDValue();
> +
> + // Otherwise, if this is a vector with i32 or f32 elements, and the element
> + // is a non-constant being inserted into an element other than the low one,
> + // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
> + // movd/movss) to move this into the low element, then shuffle it into
> + // place.
> + if (EVTBits == 32) {
> + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
> + return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
> + }
> }
> - return SDValue();
> -}
>
> -/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
> -/// to generate a splat value for the following cases:
> -/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
> -/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
> -/// a scalar load, or a constant.
> -/// The VBROADCAST node is returned when a pattern is found,
> -/// or SDValue() otherwise.
> -static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
> - SelectionDAG &DAG) {
> - // VBROADCAST requires AVX.
> - // TODO: Splats could be generated for non-AVX CPUs using SSE
> - // instructions, but there's less potential gain for only 128-bit vectors.
> - if (!Subtarget->hasAVX())
> + // Splat is obviously ok. Let legalizer expand it to a shuffle.
> + if (Values.size() == 1) {
> + if (EVTBits == 32) {
> + // Instead of a shuffle like this:
> + // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
> + // Check if it's possible to issue this instead.
> + // shuffle (vload ptr), undef, <1, 1, 1, 1>
> + unsigned Idx = countTrailingZeros(NonZeros);
> + SDValue Item = Op.getOperand(Idx);
> + if (Op.getNode()->isOnlyUserOf(Item.getNode()))
> + return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
> + }
> return SDValue();
> + }
>
> - MVT VT = Op.getSimpleValueType();
> - SDLoc dl(Op);
> -
> - assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
> - "Unsupported vector type for broadcast.");
> + // A vector full of immediates; various special cases are already
> + // handled, so this is best done with a single constant-pool load.
> + if (IsAllConstants)
> + return SDValue();
>
> - SDValue Ld;
> - bool ConstSplatVal;
> + // For AVX-length vectors, see if we can use a vector load to get all of the
> + // elements, otherwise build the individual 128-bit pieces and use
> + // shuffles to put them in place.
> + if (VT.is256BitVector() || VT.is512BitVector()) {
> + SmallVector<SDValue, 64> V(Op->op_begin(), Op->op_begin() + NumElems);
>
> - switch (Op.getOpcode()) {
> - default:
> - // Unknown pattern found.
> - return SDValue();
> + // Check for a build vector of consecutive loads.
> + if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
> + return LD;
>
> - case ISD::BUILD_VECTOR: {
> - auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
> - BitVector UndefElements;
> - SDValue Splat = BVOp->getSplatValue(&UndefElements);
> + EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
>
> - // We need a splat of a single value to use broadcast, and it doesn't
> - // make any sense if the value is only in one element of the vector.
> - if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
> - return SDValue();
> + // Build both the lower and upper subvector.
> + SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
> + makeArrayRef(&V[0], NumElems/2));
> + SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
> + makeArrayRef(&V[NumElems / 2], NumElems/2));
>
> - Ld = Splat;
> - ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
> - Ld.getOpcode() == ISD::ConstantFP);
> + // Recreate the wider vector with the lower and upper part.
> + if (VT.is256BitVector())
> + return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
> + return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
> + }
>
> - // Make sure that all of the users of a non-constant load are from the
> - // BUILD_VECTOR node.
> - if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
> - return SDValue();
> - break;
> + // Let legalizer expand 2-wide build_vectors.
> + if (EVTBits == 64) {
> + if (NumNonZero == 1) {
> + // One half is zero or undef.
> + unsigned Idx = countTrailingZeros(NonZeros);
> + SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
> + Op.getOperand(Idx));
> + return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
> }
> + return SDValue();
> + }
>
> - case ISD::VECTOR_SHUFFLE: {
> - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> + // If element VT is < 32 bits, convert it to inserts into a zero vector.
> + if (EVTBits == 8 && NumElems == 16) {
> + SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
> + Subtarget, *this);
> + if (V.getNode()) return V;
> + }
>
> - // Shuffles must have a splat mask where the first element is
> - // broadcasted.
> - if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
> - return SDValue();
> + if (EVTBits == 16 && NumElems == 8) {
> + SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
> + Subtarget, *this);
> + if (V.getNode()) return V;
> + }
>
> - SDValue Sc = Op.getOperand(0);
> - if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
> - Sc.getOpcode() != ISD::BUILD_VECTOR) {
> + // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
> + if (EVTBits == 32 && NumElems == 4) {
> + SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this);
> + if (V.getNode())
> + return V;
> + }
>
> - if (!Subtarget->hasInt256())
> - return SDValue();
> + // If element VT is == 32 bits, turn it into a number of shuffles.
> + SmallVector<SDValue, 8> V(NumElems);
> + if (NumElems == 4 && NumZero > 0) {
> + for (unsigned i = 0; i < 4; ++i) {
> + bool isZero = !(NonZeros & (1 << i));
> + if (isZero)
> + V[i] = getZeroVector(VT, Subtarget, DAG, dl);
> + else
> + V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
> + }
>
> - // Use the register form of the broadcast instruction available on AVX2.
> - if (VT.getSizeInBits() >= 256)
> - Sc = Extract128BitVector(Sc, 0, DAG, dl);
> - return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
> + for (unsigned i = 0; i < 2; ++i) {
> + switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
> + default: break;
> + case 0:
> + V[i] = V[i*2]; // Must be a zero vector.
> + break;
> + case 1:
> + V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
> + break;
> + case 2:
> + V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
> + break;
> + case 3:
> + V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
> + break;
> }
> -
> - Ld = Sc.getOperand(0);
> - ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
> - Ld.getOpcode() == ISD::ConstantFP);
> -
> - // The scalar_to_vector node and the suspected
> - // load node must have exactly one user.
> - // Constants may have multiple users.
> -
> - // AVX-512 has a register version of the broadcast
> - bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
> - Ld.getValueType().getSizeInBits() >= 32;
> - if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
> - !hasRegVer))
> - return SDValue();
> - break;
> }
> +
> + bool Reverse1 = (NonZeros & 0x3) == 2;
> + bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
> + int MaskVec[] = {
> + Reverse1 ? 1 : 0,
> + Reverse1 ? 0 : 1,
> + static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
> + static_cast<int>(Reverse2 ? NumElems : NumElems+1)
> + };
> + return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
> }
>
> - unsigned ScalarSize = Ld.getValueType().getSizeInBits();
> - bool IsGE256 = (VT.getSizeInBits() >= 256);
> + if (Values.size() > 1 && VT.is128BitVector()) {
> + // Check for a build vector of consecutive loads.
> + for (unsigned i = 0; i < NumElems; ++i)
> + V[i] = Op.getOperand(i);
>
> - // When optimizing for size, generate up to 5 extra bytes for a broadcast
> - // instruction to save 8 or more bytes of constant pool data.
> - // TODO: If multiple splats are generated to load the same constant,
> - // it may be detrimental to overall size. There needs to be a way to detect
> - // that condition to know if this is truly a size win.
> - const Function *F = DAG.getMachineFunction().getFunction();
> - bool OptForSize = F->hasFnAttribute(Attribute::OptimizeForSize);
> + // Check for elements which are consecutive loads.
> + SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
> + if (LD.getNode())
> + return LD;
>
> - // Handle broadcasting a single constant scalar from the constant pool
> - // into a vector.
> - // On Sandybridge (no AVX2), it is still better to load a constant vector
> - // from the constant pool and not to broadcast it from a scalar.
> - // But override that restriction when optimizing for size.
> - // TODO: Check if splatting is recommended for other AVX-capable CPUs.
> - if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
> - EVT CVT = Ld.getValueType();
> - assert(!CVT.isVector() && "Must not broadcast a vector type");
> + // Check for a build vector built mostly from a shuffle plus a few insertions.
> + SDValue Sh = buildFromShuffleMostly(Op, DAG);
> + if (Sh.getNode())
> + return Sh;
>
> - // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
> - // For size optimization, also splat v2f64 and v2i64, and for size opt
> - // with AVX2, also splat i8 and i16.
> - // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
> - if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
> - (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
> - const Constant *C = nullptr;
> - if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
> - C = CI->getConstantIntValue();
> - else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
> - C = CF->getConstantFPValue();
> + // For SSE 4.1, use insertps to put the high elements into the low element.
> + if (Subtarget->hasSSE41()) {
> + SDValue Result;
> + if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
> + Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
> + else
> + Result = DAG.getUNDEF(VT);
>
> - assert(C && "Invalid constant type");
> + for (unsigned i = 1; i < NumElems; ++i) {
> + if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
> + Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
> + Op.getOperand(i), DAG.getIntPtrConstant(i));
> + }
> + return Result;
> + }
>
> - const TargetLowering &TLI = DAG.getTargetLoweringInfo();
> - SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
> - unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
> - Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
> - MachinePointerInfo::getConstantPool(),
> - false, false, false, Alignment);
> + // Otherwise, expand into a number of unpckl*; start by extending each of
> + // our (non-undef) elements to the full vector width with the element in the
> + // bottom slot of the vector (which generates no code for SSE).
> + for (unsigned i = 0; i < NumElems; ++i) {
> + if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
> + V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
> + else
> + V[i] = DAG.getUNDEF(VT);
> + }
>
> - return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
> + // Next, we iteratively mix elements, e.g. for v4f32:
> + // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
> + // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
> + // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
> + unsigned EltStride = NumElems >> 1;
> + while (EltStride != 0) {
> + for (unsigned i = 0; i < EltStride; ++i) {
> + // If V[i+EltStride] is undef and this is the first round of mixing,
> + // then it is safe to just drop this shuffle: V[i] is already in the
> + // right place, the one element (since it's the first round) being
> + // inserted as undef can be dropped. This isn't safe for successive
> + // rounds because they will permute elements within both vectors.
> + if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
> + EltStride == NumElems/2)
> + continue;
> +
> + V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
> + }
> + EltStride >>= 1;
> }
> + return V[0];
> }
> + return SDValue();
> +}
>
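A note on the unpckl expansion at the end of LowerBUILD_VECTOR (illustrative, not from the patch): unpcklps interleaves the low halves of its two inputs, so in scalar terms

    // Scalar model of unpcklps on 4-lane float vectors.
    void unpcklps(const float A[4], const float B[4], float Out[4]) {
      Out[0] = A[0];
      Out[1] = B[0];
      Out[2] = A[1];
      Out[3] = B[1];
    }

With every element first placed in lane 0 by SCALAR_TO_VECTOR, two rounds of this interleave (EltStride 2, then 1) reassemble <e0, e1, e2, e3>, which is the Step 1 / Step 2 sequence sketched in the comment above.
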
> - bool IsLoad = ISD::isNormalLoad(Ld.getNode());
> -
> - // Handle AVX2 in-register broadcasts.
> - if (!IsLoad && Subtarget->hasInt256() &&
> - (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
> - return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
> +// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
> +// to create 256-bit vectors from two other 128-bit ones.
> +static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
> + SDLoc dl(Op);
> + MVT ResVT = Op.getSimpleValueType();
>
> - // The scalar source must be a normal load.
> - if (!IsLoad)
> - return SDValue();
> + assert((ResVT.is256BitVector() ||
> + ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
>
> - if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
> - (Subtarget->hasVLX() && ScalarSize == 64))
> - return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
> + SDValue V1 = Op.getOperand(0);
> + SDValue V2 = Op.getOperand(1);
> + unsigned NumElems = ResVT.getVectorNumElements();
> + if(ResVT.is256BitVector())
> + return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
>
> - // The integer check is needed for the 64-bit into 128-bit case so it doesn't
> - // match double, since there is no vbroadcastsd xmm instruction.
> - if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
> - if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
> - return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
> + if (Op.getNumOperands() == 4) {
> + MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
> + ResVT.getVectorNumElements()/2);
> + SDValue V3 = Op.getOperand(2);
> + SDValue V4 = Op.getOperand(3);
> + return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
> + Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
> }
> -
> - // Unsupported broadcast.
> - return SDValue();
> + return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
> }
>
> -/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
> -/// underlying vector and index.
> -///
> -/// Modifies \p ExtractedFromVec to the real vector and returns the real
> -/// index.
> -static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
> - SDValue ExtIdx) {
> - int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
> - if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
> - return Idx;
> +static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
> + MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
> + assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
> + (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
> + Op.getNumOperands() == 4)));
>
> - // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
> - // lowered this:
> - // (extract_vector_elt (v8f32 %vreg1), Constant<6>)
> - // to:
> - // (extract_vector_elt (vector_shuffle<2,u,u,u>
> - // (extract_subvector (v8f32 %vreg0), Constant<4>),
> - // undef)
> - // Constant<0>)
> - // In this case the vector is the extract_subvector expression and the index
> - // is 2, as specified by the shuffle.
> - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
> - SDValue ShuffleVec = SVOp->getOperand(0);
> - MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
> - assert(ShuffleVecVT.getVectorElementType() ==
> - ExtractedFromVec.getSimpleValueType().getVectorElementType());
> + // AVX can use the vinsertf128 instruction to create 256-bit vectors
> + // from two other 128-bit ones.
>
> - int ShuffleIdx = SVOp->getMaskElt(Idx);
> - if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
> - ExtractedFromVec = ShuffleVec;
> - return ShuffleIdx;
> - }
> - return Idx;
> + // A 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
> + return LowerAVXCONCAT_VECTORS(Op, DAG);
> }
>
> -static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
> - MVT VT = Op.getSimpleValueType();
>
> - // Skip if insert_vec_elt is not supported.
> - const TargetLowering &TLI = DAG.getTargetLoweringInfo();
> - if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
> - return SDValue();
> +//===----------------------------------------------------------------------===//
> +// Vector shuffle lowering
> +//
> +// This is an experimental code path for lowering vector shuffles on x86. It is
> +// designed to handle arbitrary vector shuffles and blends, gracefully
> +// degrading performance as necessary. It works hard to recognize idiomatic
> +// shuffles and lower them to optimal instruction patterns without leaving
> +// a framework that allows reasonably efficient handling of all vector shuffle
> +// patterns.
> +//===----------------------------------------------------------------------===//
>
> - SDLoc DL(Op);
> - unsigned NumElems = Op.getNumOperands();
> +/// \brief Tiny helper function to identify a no-op mask.
> +///
> +/// This is a somewhat boring predicate function. It checks whether the mask
> +/// array input, which is assumed to be a single-input shuffle mask of the kind
> +/// used by the X86 shuffle instructions (not a fully general
> +/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
> +/// in-place shuffle are 'no-op's.
> +static bool isNoopShuffleMask(ArrayRef<int> Mask) {
> + for (int i = 0, Size = Mask.size(); i < Size; ++i)
> + if (Mask[i] != -1 && Mask[i] != i)
> + return false;
> + return true;
> +}
>
> - SDValue VecIn1;
> - SDValue VecIn2;
> - SmallVector<unsigned, 4> InsertIndices;
> - SmallVector<int, 8> Mask(NumElems, -1);
> +/// \brief Helper function to classify a mask as a single-input mask.
> +///
> +/// This isn't a generic single-input test because in the vector shuffle
> +/// lowering we canonicalize single inputs to be the first input operand. This
> +/// means we can more quickly test for a single input by only checking whether
> +/// an input from the second operand exists. We also assume that the size of
> +/// mask corresponds to the size of the input vectors which isn't true in the
> +/// fully general case.
> +static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
> + for (int M : Mask)
> + if (M >= (int)Mask.size())
> + return false;
> + return true;
> +}
>
> - for (unsigned i = 0; i != NumElems; ++i) {
> - unsigned Opc = Op.getOperand(i).getOpcode();
> +/// \brief Test whether there are elements crossing 128-bit lanes in this
> +/// shuffle mask.
> +///
> +/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
> +/// and we routinely test for these.
> +static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
> + int LaneSize = 128 / VT.getScalarSizeInBits();
> + int Size = Mask.size();
> + for (int i = 0; i < Size; ++i)
> + if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
> + return true;
> + return false;
> +}
>
> - if (Opc == ISD::UNDEF)
> +/// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
> +///
> +/// This checks a shuffle mask to see if it is performing the same
> +/// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
> +/// that it is also not lane-crossing. It may however involve a blend from the
> +/// same lane of a second vector.
> +///
> +/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
> +/// non-trivial to compute in the face of undef lanes. The representation is
> +/// *not* suitable for use with existing 128-bit shuffles as it will contain
> +/// entries from both V1 and V2 inputs to the wider mask.
> +static bool
> +is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
> + SmallVectorImpl<int> &RepeatedMask) {
> + int LaneSize = 128 / VT.getScalarSizeInBits();
> + RepeatedMask.resize(LaneSize, -1);
> + int Size = Mask.size();
> + for (int i = 0; i < Size; ++i) {
> + if (Mask[i] < 0)
> continue;
> + if ((Mask[i] % Size) / LaneSize != i / LaneSize)
> + // This entry crosses lanes, so there is no way to model this shuffle.
> + return false;
>
> - if (Opc != ISD::EXTRACT_VECTOR_ELT) {
> - // Quit if more than 1 elements need inserting.
> - if (InsertIndices.size() > 1)
> - return SDValue();
> -
> - InsertIndices.push_back(i);
> - continue;
> - }
> + // Ok, handle the in-lane shuffles by detecting if and when they repeat.
> + if (RepeatedMask[i % LaneSize] == -1)
> + // This is the first non-undef entry in this slot of a 128-bit lane.
> + RepeatedMask[i % LaneSize] =
> + Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
> + else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
> + // Found a mismatch with the repeated mask.
> + return false;
> + }
> + return true;
> +}
>
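To make the "repeated" notion concrete (my example, not text from the patch): for v8f32 a 128-bit lane holds four elements, so the mask must apply the same four-element pattern to both halves. A simplified single-input version of the check, for illustration only:

    // LaneSize = 4, single input. <1,0,3,2, 5,4,7,6> passes and yields the
    // repeated mask <1,0,3,2>; <4,5,6,7, 0,1,2,3> fails because element 0
    // would read across a 128-bit lane boundary.
    bool isRepeatedPerLaneV8(const int Mask[8]) {
      int Repeated[4] = {-1, -1, -1, -1};
      for (int i = 0; i < 8; ++i) {
        if (Mask[i] < 0)
          continue;                       // undef matches anything
        if (Mask[i] / 4 != i / 4)
          return false;                   // crosses a 128-bit lane
        if (Repeated[i % 4] < 0)
          Repeated[i % 4] = Mask[i] % 4;  // first defined entry for this slot
        else if (Repeated[i % 4] != Mask[i] % 4)
          return false;                   // the two lanes disagree
      }
      return true;
    }
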
> - SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
> - SDValue ExtIdx = Op.getOperand(i).getOperand(1);
> - // Quit if non-constant index.
> - if (!isa<ConstantSDNode>(ExtIdx))
> - return SDValue();
> - int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
> -
> - // Quit if extracted from vector of different type.
> - if (ExtractedFromVec.getValueType() != VT)
> - return SDValue();
> +/// \brief Base case helper for testing a single mask element.
> +static bool isShuffleEquivalentImpl(SDValue V1, SDValue V2,
> + BuildVectorSDNode *BV1,
> + BuildVectorSDNode *BV2, ArrayRef<int> Mask,
> + int i, int Arg) {
> + int Size = Mask.size();
> + if (Mask[i] != -1 && Mask[i] != Arg) {
> + auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
> + auto *ArgsBV = Arg < Size ? BV1 : BV2;
> + if (!MaskBV || !ArgsBV ||
> + MaskBV->getOperand(Mask[i] % Size) != ArgsBV->getOperand(Arg % Size))
> + return false;
> + }
> + return true;
> +}
>
> - if (!VecIn1.getNode())
> - VecIn1 = ExtractedFromVec;
> - else if (VecIn1 != ExtractedFromVec) {
> - if (!VecIn2.getNode())
> - VecIn2 = ExtractedFromVec;
> - else if (VecIn2 != ExtractedFromVec)
> - // Quit if more than 2 vectors to shuffle
> - return SDValue();
> - }
> +/// \brief Recursive helper to peel off and test each mask element.
> +template <typename... Ts>
> +static bool isShuffleEquivalentImpl(SDValue V1, SDValue V2,
> + BuildVectorSDNode *BV1,
> + BuildVectorSDNode *BV2, ArrayRef<int> Mask,
> + int i, int Arg, Ts... Args) {
> + if (!isShuffleEquivalentImpl(V1, V2, BV1, BV2, Mask, i, Arg))
> + return false;
>
> - if (ExtractedFromVec == VecIn1)
> - Mask[i] = Idx;
> - else if (ExtractedFromVec == VecIn2)
> - Mask[i] = Idx + NumElems;
> - }
> + return isShuffleEquivalentImpl(V1, V2, BV1, BV2, Mask, i + 1, Args...);
> +}
>
> - if (!VecIn1.getNode())
> - return SDValue();
> +/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
> +/// arguments.
> +///
> +/// This is a fast way to test a shuffle mask against a fixed pattern:
> +///
> +/// if (isShuffleEquivalent(Mask, 3, 2, 1, 0)) { ... }
> +///
> +/// It returns true if the mask is exactly as wide as the argument list, and
> +/// each element of the mask is either -1 (signifying undef) or the value given
> +/// in the argument.
> +template <typename... Ts>
> +static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
> + Ts... Args) {
> + if (Mask.size() != sizeof...(Args))
> + return false;
>
> - VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
> - SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
> - for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
> - unsigned Idx = InsertIndices[i];
> - NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
> - DAG.getIntPtrConstant(Idx));
> - }
> + // If the values are build vectors, we can look through them to find
> + // equivalent inputs that make the shuffles equivalent.
> + auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
> + auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
>
> - return NV;
> + // Recursively peel off arguments and test them against the mask.
> + return isShuffleEquivalentImpl(V1, V2, BV1, BV2, Mask, 0, Args...);
> }
>
> -// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
> -SDValue
> -X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
> +/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
> +///
> +/// This helper function produces an 8-bit shuffle immediate corresponding to
> +/// the ubiquitous shuffle encoding scheme used in x86 instructions for
> +/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
> +/// example.
> +///
> +/// NB: We rely heavily on "undef" masks preserving the input lane.
> +static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
> + SelectionDAG &DAG) {
> + assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
> + assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
> + assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
> + assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
> + assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
>
> - MVT VT = Op.getSimpleValueType();
> - assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
> - "Unexpected type in LowerBUILD_VECTORvXi1!");
> + unsigned Imm = 0;
> + Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
> + Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
> + Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
> + Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
> + return DAG.getConstant(Imm, MVT::i8);
> +}
>
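A worked instance of the immediate encoding (my arithmetic, not from the patch): each lane contributes two bits, lane 0 in the low bits, and undef lanes fall back to their own index so the input lane is preserved.

    // Full reverse of a 4-lane vector: mask <3, 2, 1, 0>.
    constexpr unsigned Imm = (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6); // 0x1B
    // e.g. pshufd $0x1b, %xmm0, %xmm1 reverses the four dwords of %xmm0.
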
> - SDLoc dl(Op);
> - if (ISD::isBuildVectorAllZeros(Op.getNode())) {
> - SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
> - SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
> - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
> +/// \brief Try to emit a blend instruction for a shuffle using bit math.
> +///
> +/// This is used as a fallback approach when first class blend instructions are
> +/// unavailable. Currently it is only suitable for integer vectors, but could
> +/// be generalized for floating point vectors if desirable.
> +static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1,
> + SDValue V2, ArrayRef<int> Mask,
> + SelectionDAG &DAG) {
> + assert(VT.isInteger() && "Only supports integer vector types!");
> + MVT EltVT = VT.getScalarType();
> + int NumEltBits = EltVT.getSizeInBits();
> + SDValue Zero = DAG.getConstant(0, EltVT);
> + SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), EltVT);
> + SmallVector<SDValue, 16> MaskOps;
> + for (int i = 0, Size = Mask.size(); i < Size; ++i) {
> + if (Mask[i] != -1 && Mask[i] != i && Mask[i] != i + Size)
> + return SDValue(); // Shuffled input!
> + MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
> }
>
> - if (ISD::isBuildVectorAllOnes(Op.getNode())) {
> - SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
> - SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
> - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
> - }
> + SDValue V1Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, MaskOps);
> + V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
> + // We have to cast V2 around.
> + MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
> + V2 = DAG.getNode(ISD::BITCAST, DL, VT,
> + DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
> + DAG.getNode(ISD::BITCAST, DL, MaskVT, V1Mask),
> + DAG.getNode(ISD::BITCAST, DL, MaskVT, V2)));
> + return DAG.getNode(ISD::OR, DL, VT, V1, V2);
> +}
>
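The bit-math blend above is just a per-lane select done with AND/ANDNP/OR; a scalar sketch (illustrative only, names are made up):

    #include <cstdint>
    // Lanes whose mask entry points into V1 keep V1's bits (mask all-ones);
    // the remaining lanes take V2's bits (mask all-zeros).
    uint32_t bitBlendLane(bool TakeV1, uint32_t V1, uint32_t V2) {
      uint32_t M = TakeV1 ? 0xFFFFFFFFu : 0u;
      return (V1 & M) | (V2 & ~M);
    }

The vector version builds the mask as a BUILD_VECTOR of all-ones/zero constants and uses X86ISD::ANDNP for the (~M & V2) half, which is why V2 is bitcast through an i64 vector type.
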
> - bool AllContants = true;
> - uint64_t Immediate = 0;
> - int NonConstIdx = -1;
> - bool IsSplat = true;
> - unsigned NumNonConsts = 0;
> - unsigned NumConsts = 0;
> - for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
> - SDValue In = Op.getOperand(idx);
> - if (In.getOpcode() == ISD::UNDEF)
> +/// \brief Try to emit a blend instruction for a shuffle.
> +///
> +/// This doesn't do any checks for the availability of instructions for blending
> +/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
> +/// be matched in the backend with the type given. What it does check for is
> +/// that the shuffle mask is in fact a blend.
> +static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
> + SDValue V2, ArrayRef<int> Mask,
> + const X86Subtarget *Subtarget,
> + SelectionDAG &DAG) {
> + unsigned BlendMask = 0;
> + for (int i = 0, Size = Mask.size(); i < Size; ++i) {
> + if (Mask[i] >= Size) {
> + if (Mask[i] != i + Size)
> + return SDValue(); // Shuffled V2 input!
> + BlendMask |= 1u << i;
> continue;
> - if (!isa<ConstantSDNode>(In)) {
> - AllContants = false;
> - NonConstIdx = idx;
> - NumNonConsts++;
> - } else {
> - NumConsts++;
> - if (cast<ConstantSDNode>(In)->getZExtValue())
> - Immediate |= (1ULL << idx);
> }
> - if (In != Op.getOperand(0))
> - IsSplat = false;
> + if (Mask[i] >= 0 && Mask[i] != i)
> + return SDValue(); // Shuffled V1 input!
> }
> + switch (VT.SimpleTy) {
> + case MVT::v2f64:
> + case MVT::v4f32:
> + case MVT::v4f64:
> + case MVT::v8f32:
> + return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
> + DAG.getConstant(BlendMask, MVT::i8));
>
> - if (AllContants) {
> - SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
> - DAG.getConstant(Immediate, MVT::i16));
> - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
> - DAG.getIntPtrConstant(0));
> + case MVT::v4i64:
> + case MVT::v8i32:
> + assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
> + // FALLTHROUGH
> + case MVT::v2i64:
> + case MVT::v4i32:
> + // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
> + // that instruction.
> + if (Subtarget->hasAVX2()) {
> + // Scale the blend by the number of 32-bit dwords per element.
> + int Scale = VT.getScalarSizeInBits() / 32;
> + BlendMask = 0;
> + for (int i = 0, Size = Mask.size(); i < Size; ++i)
> + if (Mask[i] >= Size)
> + for (int j = 0; j < Scale; ++j)
> + BlendMask |= 1u << (i * Scale + j);
> +
> + MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
> + V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
> + V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
> + return DAG.getNode(ISD::BITCAST, DL, VT,
> + DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
> + DAG.getConstant(BlendMask, MVT::i8)));
> + }
> + // FALLTHROUGH
> + case MVT::v8i16: {
> + // For integer shuffles we need to expand the mask and cast the inputs to
> + // v8i16s prior to blending.
> + int Scale = 8 / VT.getVectorNumElements();
> + BlendMask = 0;
> + for (int i = 0, Size = Mask.size(); i < Size; ++i)
> + if (Mask[i] >= Size)
> + for (int j = 0; j < Scale; ++j)
> + BlendMask |= 1u << (i * Scale + j);
> +
> + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
> + V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
> + return DAG.getNode(ISD::BITCAST, DL, VT,
> + DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
> + DAG.getConstant(BlendMask, MVT::i8)));
> }
>
> - if (NumNonConsts == 1 && NonConstIdx != 0) {
> - SDValue DstVec;
> - if (NumConsts) {
> - SDValue VecAsImm = DAG.getConstant(Immediate,
> - MVT::getIntegerVT(VT.getSizeInBits()));
> - DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
> + case MVT::v16i16: {
> + assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
> + SmallVector<int, 8> RepeatedMask;
> + if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
> + // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
> + assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
> + BlendMask = 0;
> + for (int i = 0; i < 8; ++i)
> + if (RepeatedMask[i] >= 16)
> + BlendMask |= 1u << i;
> + return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
> + DAG.getConstant(BlendMask, MVT::i8));
> }
> - else
> - DstVec = DAG.getUNDEF(VT);
> - return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
> - Op.getOperand(NonConstIdx),
> - DAG.getIntPtrConstant(NonConstIdx));
> }
> - if (!IsSplat && (NonConstIdx != 0))
> - llvm_unreachable("Unsupported BUILD_VECTOR operation");
> - MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
> - SDValue Select;
> - if (IsSplat)
> - Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
> - DAG.getConstant(-1, SelectVT),
> - DAG.getConstant(0, SelectVT));
> - else
> - Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
> - DAG.getConstant((Immediate | 1), SelectVT),
> - DAG.getConstant(Immediate, SelectVT));
> - return DAG.getNode(ISD::BITCAST, dl, VT, Select);
> -}
> + // FALLTHROUGH
> + case MVT::v16i8:
> + case MVT::v32i8: {
> + // Scale the blend by the number of bytes per element.
> + int Scale = VT.getScalarSizeInBits() / 8;
>
> -/// \brief Return true if \p N implements a horizontal binop and return the
> -/// operands for the horizontal binop into V0 and V1.
> -///
> -/// This is a helper function of PerformBUILD_VECTORCombine.
> -/// This function checks that the build_vector \p N in input implements a
> -/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
> -/// operation to match.
> -/// For example, if \p Opcode is equal to ISD::ADD, then this function
> -/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
> -/// is equal to ISD::SUB, then this function checks if this is a horizontal
> -/// arithmetic sub.
> -///
> -/// This function only analyzes elements of \p N whose indices are
> -/// in range [BaseIdx, LastIdx).
> -static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
> - SelectionDAG &DAG,
> - unsigned BaseIdx, unsigned LastIdx,
> - SDValue &V0, SDValue &V1) {
> - EVT VT = N->getValueType(0);
> + // This form of blend is always done on bytes. Compute the byte vector
> + // type.
> + MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
>
> - assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
> - assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
> - "Invalid Vector in input!");
> + // Compute the VSELECT mask. Note that VSELECT is really confusing in the
> + // mix of LLVM's code generator and the x86 backend. We tell the code
> + // generator that boolean values in the elements of an x86 vector register
> + // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
> + // mapping a select to operand #1, and 'false' mapping to operand #2. The
> + // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
> + // of the element (the remaining are ignored) and 0 in that high bit would
> + // mean operand #1 while 1 in the high bit would mean operand #2. So while
> + // the LLVM model for boolean values in vector elements gets the relevant
> + // bit set, it is set backwards and over constrained relative to x86's
> + // actual model.
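> + // For example, in a v16i8 blend a byte taken from V1 gets the mask byte
> + // 0xFF (true selects operand #1, V1) while a byte taken from V2 gets 0x00.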
> + SmallVector<SDValue, 32> VSELECTMask;
> + for (int i = 0, Size = Mask.size(); i < Size; ++i)
> + for (int j = 0; j < Scale; ++j)
> + VSELECTMask.push_back(
> + Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
> + : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8));
>
> - bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
> - bool CanFold = true;
> - unsigned ExpectedVExtractIdx = BaseIdx;
> - unsigned NumElts = LastIdx - BaseIdx;
> - V0 = DAG.getUNDEF(VT);
> - V1 = DAG.getUNDEF(VT);
> + V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
> + V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
> + return DAG.getNode(
> + ISD::BITCAST, DL, VT,
> + DAG.getNode(ISD::VSELECT, DL, BlendVT,
> + DAG.getNode(ISD::BUILD_VECTOR, DL, BlendVT, VSELECTMask),
> + V1, V2));
> + }
>
> - // Check if N implements a horizontal binop.
> - for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
> - SDValue Op = N->getOperand(i + BaseIdx);
> + default:
> + llvm_unreachable("Not a supported integer vector type!");
> + }
> +}
>
> - // Skip UNDEFs.
> - if (Op->getOpcode() == ISD::UNDEF) {
> - // Update the expected vector extract index.
> - if (i * 2 == NumElts)
> - ExpectedVExtractIdx = BaseIdx;
> - ExpectedVExtractIdx += 2;
> - continue;
> - }
> +/// \brief Try to lower as a blend of elements from two inputs followed by
> +/// a single-input permutation.
> +///
> +/// This matches the pattern where we can blend elements from two inputs and
> +/// then reduce the shuffle to a single-input permutation.
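> +///
> +/// For example, the v4i32 mask <5, 0, 6, 3> is not itself a blend, but it can
> +/// be lowered as the blend <0, 5, 6, 3> followed by the single-input permute
> +/// <1, 0, 2, 3>.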
> +static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1,
> + SDValue V2,
> + ArrayRef<int> Mask,
> + SelectionDAG &DAG) {
> + // We build up the blend mask while checking whether a blend is a viable way
> + // to reduce the shuffle.
> + SmallVector<int, 32> BlendMask(Mask.size(), -1);
> + SmallVector<int, 32> PermuteMask(Mask.size(), -1);
>
> - CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
> + for (int i = 0, Size = Mask.size(); i < Size; ++i) {
> + if (Mask[i] < 0)
> + continue;
>
> - if (!CanFold)
> - break;
> + assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
>
> - SDValue Op0 = Op.getOperand(0);
> - SDValue Op1 = Op.getOperand(1);
> + if (BlendMask[Mask[i] % Size] == -1)
> + BlendMask[Mask[i] % Size] = Mask[i];
> + else if (BlendMask[Mask[i] % Size] != Mask[i])
> + return SDValue(); // Can't blend in the needed input!
>
> - // Try to match the following pattern:
> - // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
> - CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
> - Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
> - Op0.getOperand(0) == Op1.getOperand(0) &&
> - isa<ConstantSDNode>(Op0.getOperand(1)) &&
> - isa<ConstantSDNode>(Op1.getOperand(1)));
> - if (!CanFold)
> - break;
> + PermuteMask[i] = Mask[i] % Size;
> + }
>
> - unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
> - unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
> + SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
> + return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
> +}
>
> - if (i * 2 < NumElts) {
> - if (V0.getOpcode() == ISD::UNDEF)
> - V0 = Op0.getOperand(0);
> - } else {
> - if (V1.getOpcode() == ISD::UNDEF)
> - V1 = Op0.getOperand(0);
> - if (i * 2 == NumElts)
> - ExpectedVExtractIdx = BaseIdx;
> +/// \brief Generic routine to decompose a shuffle and blend into independent
> +/// blends and permutes.
> +///
> +/// This matches the extremely common pattern for handling combined
> +/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
> +/// operations. It will try to pick the best arrangement of shuffles and
> +/// blends.
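> +///
> +/// For example, the v4i32 mask <2, 7, 0, 5> decomposes into the single-input
> +/// shuffles <2, -1, 0, -1> of V1 and <-1, 3, -1, 1> of V2, blended together
> +/// with the mask <0, 5, 2, 7>.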
> +static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
> + SDValue V1,
> + SDValue V2,
> + ArrayRef<int> Mask,
> + SelectionDAG &DAG) {
> + // Shuffle the input elements into the desired positions in V1 and V2 and
> + // blend them together.
> + SmallVector<int, 32> V1Mask(Mask.size(), -1);
> + SmallVector<int, 32> V2Mask(Mask.size(), -1);
> + SmallVector<int, 32> BlendMask(Mask.size(), -1);
> + for (int i = 0, Size = Mask.size(); i < Size; ++i)
> + if (Mask[i] >= 0 && Mask[i] < Size) {
> + V1Mask[i] = Mask[i];
> + BlendMask[i] = i;
> + } else if (Mask[i] >= Size) {
> + V2Mask[i] = Mask[i] - Size;
> + BlendMask[i] = i + Size;
> }
>
> - SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
> - if (I0 == ExpectedVExtractIdx)
> - CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
> - else if (IsCommutable && I1 == ExpectedVExtractIdx) {
> - // Try to match the following dag sequence:
> - // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
> - CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
> - } else
> - CanFold = false;
> -
> - ExpectedVExtractIdx += 2;
> - }
> + // Try to lower with the simpler initial blend strategy unless one of the
> + // input shuffles would be a no-op. We prefer to shuffle inputs as the
> + // shuffle may be able to fold with a load or other benefit. However, when
> + // we'll have to do 2x as many shuffles in order to achieve this, blending
> + // first is a better strategy.
> + if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
> + if (SDValue BlendPerm =
> + lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
> + return BlendPerm;
>
> - return CanFold;
> + V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
> + V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
> + return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
> }
>
> -/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
> -/// a concat_vector.
> -///
> -/// This is a helper function of PerformBUILD_VECTORCombine.
> -/// This function expects two 256-bit vectors called V0 and V1.
> -/// At first, each vector is split into two separate 128-bit vectors.
> -/// Then, the resulting 128-bit vectors are used to implement two
> -/// horizontal binary operations.
> -///
> -/// The kind of horizontal binary operation is defined by \p X86Opcode.
> +/// \brief Try to lower a vector shuffle as a byte rotation.
> ///
> -/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
> -/// the two new horizontal binop.
> -/// When Mode is set, the first horizontal binop dag node would take as input
> -/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
> -/// horizontal binop dag node would take as input the lower 128-bit of V1
> -/// and the upper 128-bit of V1.
> -/// Example:
> -/// HADD V0_LO, V0_HI
> -/// HADD V1_LO, V1_HI
> +/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
> +/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
> +/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
> +/// try to generically lower a vector shuffle through such a pattern. It
> +/// does not check for the profitability of lowering either as PALIGNR or
> +/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
> +/// This matches shuffle vectors that look like:
> ///
> -/// Otherwise, the first horizontal binop dag node takes as input the lower
> -/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
> -/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
> -/// Example:
> -/// HADD V0_LO, V1_LO
> -/// HADD V0_HI, V1_HI
> +/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
> ///
> -/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
> -/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
> -/// the upper 128-bits of the result.
> -static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
> - SDLoc DL, SelectionDAG &DAG,
> - unsigned X86Opcode, bool Mode,
> - bool isUndefLO, bool isUndefHI) {
> - EVT VT = V0.getValueType();
> - assert(VT.is256BitVector() && VT == V1.getValueType() &&
> - "Invalid nodes in input!");
> -
> - unsigned NumElts = VT.getVectorNumElements();
> - SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
> - SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
> - SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
> - SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
> - EVT NewVT = V0_LO.getValueType();
> +/// Essentially it concatenates V1 and V2, shifts right by some number of
> +/// elements, and takes the low elements as the result. Note that while this is
> +/// specified as a *right shift* because x86 is little-endian, it is a *left
> +/// rotate* of the vector lanes.
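> +///
> +/// For the v8i16 example above the rotation is 3 elements, i.e. 6 bytes; on
> +/// SSSE3 that becomes a PALIGNR with immediate 6, while the SSE2 fallback
> +/// combines a PSLLDQ by 10 and a PSRLDQ by 6 with a POR.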
> +static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
> + SDValue V2,
> + ArrayRef<int> Mask,
> + const X86Subtarget *Subtarget,
> + SelectionDAG &DAG) {
> + assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
>
> - SDValue LO = DAG.getUNDEF(NewVT);
> - SDValue HI = DAG.getUNDEF(NewVT);
> + int NumElts = Mask.size();
> + int NumLanes = VT.getSizeInBits() / 128;
> + int NumLaneElts = NumElts / NumLanes;
>
> - if (Mode) {
> - // Don't emit a horizontal binop if the result is expected to be UNDEF.
> - if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
> - LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
> - if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
> - HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
> - } else {
> - // Don't emit a horizontal binop if the result is expected to be UNDEF.
> - if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
> - V1_LO->getOpcode() != ISD::UNDEF))
> - LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
> + // We need to detect various ways of spelling a rotation:
> + // [11, 12, 13, 14, 15, 0, 1, 2]
> + // [-1, 12, 13, 14, -1, -1, 1, -1]
> + // [-1, -1, -1, -1, -1, -1, 1, 2]
> + // [ 3, 4, 5, 6, 7, 8, 9, 10]
> + // [-1, 4, 5, 6, -1, -1, 9, -1]
> + // [-1, 4, 5, 6, -1, -1, -1, -1]
> + int Rotation = 0;
> + SDValue Lo, Hi;
> + for (int l = 0; l < NumElts; l += NumLaneElts) {
> + for (int i = 0; i < NumLaneElts; ++i) {
> + if (Mask[l + i] == -1)
> + continue;
> + assert(Mask[l + i] >= 0 && "Only -1 is a valid negative mask element!");
>
> - if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
> - V1_HI->getOpcode() != ISD::UNDEF))
> - HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
> - }
> + // Get the mod-Size index and lane correct it.
> + int LaneIdx = (Mask[l + i] % NumElts) - l;
> + // Make sure it was in this lane.
> + if (LaneIdx < 0 || LaneIdx >= NumLaneElts)
> + return SDValue();
>
> - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
> -}
> + // Determine where a rotated vector would have started.
> + int StartIdx = i - LaneIdx;
> + if (StartIdx == 0)
> + // The identity rotation isn't interesting, stop.
> + return SDValue();
>
> -/// \brief Try to fold a build_vector that performs an 'addsub' into the
> -/// sequence of 'vadd + vsub + blendi'.
> -static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
> - const X86Subtarget *Subtarget) {
> - SDLoc DL(BV);
> - EVT VT = BV->getValueType(0);
> - unsigned NumElts = VT.getVectorNumElements();
> - SDValue InVec0 = DAG.getUNDEF(VT);
> - SDValue InVec1 = DAG.getUNDEF(VT);
> + // If we found the tail of a vector the rotation must be the missing
> + // front. If we found the head of a vector, it must be how much of the
> + // head.
> + int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx;
>
> - assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
> - VT == MVT::v2f64) && "build_vector with an invalid type found!");
> + if (Rotation == 0)
> + Rotation = CandidateRotation;
> + else if (Rotation != CandidateRotation)
> + // The rotations don't match, so we can't match this mask.
> + return SDValue();
>
> - // Odd-numbered elements in the input build vector are obtained from
> - // adding two integer/float elements.
> - // Even-numbered elements in the input build vector are obtained from
> - // subtracting two integer/float elements.
> - unsigned ExpectedOpcode = ISD::FSUB;
> - unsigned NextExpectedOpcode = ISD::FADD;
> - bool AddFound = false;
> - bool SubFound = false;
> + // Compute which value this mask is pointing at.
> + SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2;
>
> - for (unsigned i = 0, e = NumElts; i != e; ++i) {
> - SDValue Op = BV->getOperand(i);
> + // Compute which of the two target values this index should be assigned
> + // to. This reflects whether the high elements are remaining or the low
> + // elements are remaining.
> + SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
>
> - // Skip 'undef' values.
> - unsigned Opcode = Op.getOpcode();
> - if (Opcode == ISD::UNDEF) {
> - std::swap(ExpectedOpcode, NextExpectedOpcode);
> - continue;
> + // Either set up this value if we've not encountered it before, or check
> + // that it remains consistent.
> + if (!TargetV)
> + TargetV = MaskV;
> + else if (TargetV != MaskV)
> + // This may be a rotation, but it pulls from the inputs in some
> + // unsupported interleaving.
> + return SDValue();
> }
> + }
>
> - // Early exit if we found an unexpected opcode.
> - if (Opcode != ExpectedOpcode)
> - return SDValue();
> + // Check that we successfully analyzed the mask, and normalize the results.
> + assert(Rotation != 0 && "Failed to locate a viable rotation!");
> + assert((Lo || Hi) && "Failed to find a rotated input vector!");
> + if (!Lo)
> + Lo = Hi;
> + else if (!Hi)
> + Hi = Lo;
>
> - SDValue Op0 = Op.getOperand(0);
> - SDValue Op1 = Op.getOperand(1);
> + // The actual rotate instruction rotates bytes, so we need to scale the
> + // rotation based on how many bytes are in the vector lane.
> + int Scale = 16 / NumLaneElts;
>
> - // Try to match the following pattern:
> - // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
> - // Early exit if we cannot match that sequence.
> - if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
> - Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
> - !isa<ConstantSDNode>(Op0.getOperand(1)) ||
> - !isa<ConstantSDNode>(Op1.getOperand(1)) ||
> - Op0.getOperand(1) != Op1.getOperand(1))
> - return SDValue();
> + // SSSE3 targets can use the palignr instruction.
> + if (Subtarget->hasSSSE3()) {
> + // Cast the inputs to i8 vector of correct length to match PALIGNR.
> + MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
> + Lo = DAG.getNode(ISD::BITCAST, DL, AlignVT, Lo);
> + Hi = DAG.getNode(ISD::BITCAST, DL, AlignVT, Hi);
>
> - unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
> - if (I0 != i)
> - return SDValue();
> + return DAG.getNode(ISD::BITCAST, DL, VT,
> + DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Hi, Lo,
> + DAG.getConstant(Rotation * Scale, MVT::i8)));
> + }
>
> - // We found a valid add/sub node. Update the information accordingly.
> - if (i & 1)
> - AddFound = true;
> - else
> - SubFound = true;
> + assert(VT.getSizeInBits() == 128 &&
> + "Rotate-based lowering only supports 128-bit lowering!");
> + assert(Mask.size() <= 16 &&
> + "Can shuffle at most 16 bytes in a 128-bit vector!");
>
> - // Update InVec0 and InVec1.
> - if (InVec0.getOpcode() == ISD::UNDEF)
> - InVec0 = Op0.getOperand(0);
> - if (InVec1.getOpcode() == ISD::UNDEF)
> - InVec1 = Op1.getOperand(0);
> + // Default SSE2 implementation
> + int LoByteShift = 16 - Rotation * Scale;
> + int HiByteShift = Rotation * Scale;
>
> - // Make sure that operands in input to each add/sub node always
> - // come from a same pair of vectors.
> - if (InVec0 != Op0.getOperand(0)) {
> - if (ExpectedOpcode == ISD::FSUB)
> - return SDValue();
> + // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
> + Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
> + Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
>
> - // FADD is commutable. Try to commute the operands
> - // and then test again.
> - std::swap(Op0, Op1);
> - if (InVec0 != Op0.getOperand(0))
> - return SDValue();
> + SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
> + DAG.getConstant(LoByteShift, MVT::i8));
> + SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
> + DAG.getConstant(HiByteShift, MVT::i8));
> + return DAG.getNode(ISD::BITCAST, DL, VT,
> + DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
> +}
> +
> +/// \brief Compute whether each element of a shuffle is zeroable.
> +///
> +/// A "zeroable" vector shuffle element is one which can be lowered to zero.
> +/// Either it is an undef element in the shuffle mask, the element of the input
> +/// referenced is undef, or the element of the input referenced is known to be
> +/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
> +/// as many lanes with this technique as possible to simplify the remaining
> +/// shuffle.
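> +///
> +/// For example, with the v4i32 mask <0, -1, 7, 3> and a V2 that is a
> +/// build_vector of all zeros, lanes 1 (undef in the mask) and 2 (reads the
> +/// zero V2) are zeroable.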
> +static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
> + SDValue V1, SDValue V2) {
> + SmallBitVector Zeroable(Mask.size(), false);
> +
> + while (V1.getOpcode() == ISD::BITCAST)
> + V1 = V1->getOperand(0);
> + while (V2.getOpcode() == ISD::BITCAST)
> + V2 = V2->getOperand(0);
> +
> + bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
> + bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
> +
> + for (int i = 0, Size = Mask.size(); i < Size; ++i) {
> + int M = Mask[i];
> + // Handle the easy cases.
> + if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
> + Zeroable[i] = true;
> + continue;
> }
>
> - if (InVec1 != Op1.getOperand(0))
> - return SDValue();
> + // If this is an index into a build_vector node (which has the same number
> + // of elements), dig out the input value and use it.
> + SDValue V = M < Size ? V1 : V2;
> + if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
> + continue;
>
> - // Update the pair of expected opcodes.
> - std::swap(ExpectedOpcode, NextExpectedOpcode);
> + SDValue Input = V.getOperand(M % Size);
> + // The UNDEF opcode check really should be dead code here, but not quite
> + // worth asserting on (it isn't invalid, just unexpected).
> + if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
> + Zeroable[i] = true;
> }
>
> - // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
> - if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
> - InVec1.getOpcode() != ISD::UNDEF)
> - return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
> -
> - return SDValue();
> + return Zeroable;
> }
>
> -static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
> - const X86Subtarget *Subtarget) {
> - SDLoc DL(N);
> - EVT VT = N->getValueType(0);
> - unsigned NumElts = VT.getVectorNumElements();
> - BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
> - SDValue InVec0, InVec1;
> +/// \brief Try to emit a bitmask instruction for a shuffle.
> +///
> +/// This handles cases where we can model a blend exactly as a bitmask due to
> +/// one of the inputs being zeroable.
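> +///
> +/// For example, for the v4f32 mask <0, 5, 2, 7> with an all-zeros V2, lanes 1
> +/// and 3 are zeroable and lanes 0 and 2 come from V1 in place, so this emits
> +/// V1 & <-1, 0, -1, 0> (using FAND for floating point types).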
> +static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
> + SDValue V2, ArrayRef<int> Mask,
> + SelectionDAG &DAG) {
> + MVT EltVT = VT.getScalarType();
> + int NumEltBits = EltVT.getSizeInBits();
> + MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
> + SDValue Zero = DAG.getConstant(0, IntEltVT);
> + SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), IntEltVT);
> + if (EltVT.isFloatingPoint()) {
> + Zero = DAG.getNode(ISD::BITCAST, DL, EltVT, Zero);
> + AllOnes = DAG.getNode(ISD::BITCAST, DL, EltVT, AllOnes);
> + }
> + SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
> + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
> + SDValue V;
> + for (int i = 0, Size = Mask.size(); i < Size; ++i) {
> + if (Zeroable[i])
> + continue;
> + if (Mask[i] % Size != i)
> + return SDValue(); // Not a blend.
> + if (!V)
> + V = Mask[i] < Size ? V1 : V2;
> + else if (V != (Mask[i] < Size ? V1 : V2))
> + return SDValue(); // Can only let one input through the mask.
>
> - // Try to match an ADDSUB.
> - if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
> - (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
> - SDValue Value = matchAddSub(BV, DAG, Subtarget);
> - if (Value.getNode())
> - return Value;
> + VMaskOps[i] = AllOnes;
> }
> + if (!V)
> + return SDValue(); // No non-zeroable elements!
>
> - // Try to match horizontal ADD/SUB.
> - unsigned NumUndefsLO = 0;
> - unsigned NumUndefsHI = 0;
> - unsigned Half = NumElts/2;
> -
> - // Count the number of UNDEF operands in the build_vector in input.
> - for (unsigned i = 0, e = Half; i != e; ++i)
> - if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
> - NumUndefsLO++;
> -
> - for (unsigned i = Half, e = NumElts; i != e; ++i)
> - if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
> - NumUndefsHI++;
> -
> - // Early exit if this is either a build_vector of all UNDEFs or all the
> - // operands but one are UNDEF.
> - if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
> - return SDValue();
> -
> - if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
> - // Try to match an SSE3 float HADD/HSUB.
> - if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
> - return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
> -
> - if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
> - return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
> - } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
> - // Try to match an SSSE3 integer HADD/HSUB.
> - if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
> - return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
> -
> - if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
> - return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
> - }
> -
> - if (!Subtarget->hasAVX())
> - return SDValue();
> + SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
> + V = DAG.getNode(VT.isFloatingPoint()
> + ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
> + DL, VT, V, VMask);
> + return V;
> +}
>
> - if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
> - // Try to match an AVX horizontal add/sub of packed single/double
> - // precision floating point values from 256-bit vectors.
> - SDValue InVec2, InVec3;
> - if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
> - isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
> - ((InVec0.getOpcode() == ISD::UNDEF ||
> - InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
> - ((InVec1.getOpcode() == ISD::UNDEF ||
> - InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
> - return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
> +/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
> +///
> +/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
> +/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
> +/// matches elements from one of the input vectors shuffled to the left or
> +/// right with zeroable elements 'shifted in'. It handles both the strictly
> +/// bit-wise element shifts and the byte shift across an entire 128-bit double
> +/// quad word lane.
> +///
> +/// PSLL : (little-endian) left bit shift.
> +/// [ zz, 0, zz, 2 ]
> +/// [ -1, 4, zz, -1 ]
> +/// PSRL : (little-endian) right bit shift.
> +/// [ 1, zz, 3, zz]
> +/// [ -1, -1, 7, zz]
> +/// PSLLDQ : (little-endian) left byte shift
> +/// [ zz, 0, 1, 2, 3, 4, 5, 6]
> +/// [ zz, zz, -1, -1, 2, 3, 4, -1]
> +/// [ zz, zz, zz, zz, zz, zz, -1, 1]
> +/// PSRLDQ : (little-endian) right byte shift
> +/// [ 5, 6, 7, zz, zz, zz, zz, zz]
> +/// [ -1, 5, 6, 7, zz, zz, zz, zz]
> +/// [ 1, 2, -1, -1, -1, -1, zz, zz]
> +static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
> + SDValue V2, ArrayRef<int> Mask,
> + SelectionDAG &DAG) {
> + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
>
> - if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
> - isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
> - ((InVec0.getOpcode() == ISD::UNDEF ||
> - InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
> - ((InVec1.getOpcode() == ISD::UNDEF ||
> - InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
> - return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
> - } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
> - // Try to match an AVX2 horizontal add/sub of signed integers.
> - SDValue InVec2, InVec3;
> - unsigned X86Opcode;
> - bool CanFold = true;
> + int Size = Mask.size();
> + assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
>
> - if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
> - isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
> - ((InVec0.getOpcode() == ISD::UNDEF ||
> - InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
> - ((InVec1.getOpcode() == ISD::UNDEF ||
> - InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
> - X86Opcode = X86ISD::HADD;
> - else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
> - isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
> - ((InVec0.getOpcode() == ISD::UNDEF ||
> - InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
> - ((InVec1.getOpcode() == ISD::UNDEF ||
> - InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
> - X86Opcode = X86ISD::HSUB;
> - else
> - CanFold = false;
> + auto CheckZeros = [&](int Shift, int Scale, bool Left) {
> + for (int i = 0; i < Size; i += Scale)
> + for (int j = 0; j < Shift; ++j)
> + if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
> + return false;
>
> - if (CanFold) {
> - // Fold this build_vector into a single horizontal add/sub.
> - // Do this only if the target has AVX2.
> - if (Subtarget->hasAVX2())
> - return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
> + return true;
> + };
>
> - // Do not try to expand this build_vector into a pair of horizontal
> - // add/sub if we can emit a pair of scalar add/sub.
> - if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
> + auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) {
> + for (int i = 0; i != Size; i += Scale) {
> + unsigned Pos = Left ? i + Shift : i;
> + unsigned Low = Left ? i : i + Shift;
> + unsigned Len = Scale - Shift;
> + if (!isSequentialOrUndefInRange(Mask, Pos, Len,
> + Low + (V == V1 ? 0 : Size)))
> return SDValue();
> -
> - // Convert this build_vector into a pair of horizontal binop followed by
> - // a concat vector.
> - bool isUndefLO = NumUndefsLO == Half;
> - bool isUndefHI = NumUndefsHI == Half;
> - return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
> - isUndefLO, isUndefHI);
> }
> - }
> -
> - if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
> - VT == MVT::v16i16) && Subtarget->hasAVX()) {
> - unsigned X86Opcode;
> - if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
> - X86Opcode = X86ISD::HADD;
> - else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
> - X86Opcode = X86ISD::HSUB;
> - else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
> - X86Opcode = X86ISD::FHADD;
> - else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
> - X86Opcode = X86ISD::FHSUB;
> - else
> - return SDValue();
>
> - // Don't try to expand this build_vector into a pair of horizontal add/sub
> - // if we can simply emit a pair of scalar add/sub.
> - if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
> - return SDValue();
> + int ShiftEltBits = VT.getScalarSizeInBits() * Scale;
> + bool ByteShift = ShiftEltBits > 64;
> + unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
> + : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
> + int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1);
>
> - // Convert this build_vector into two horizontal add/sub followed by
> - // a concat vector.
> - bool isUndefLO = NumUndefsLO == Half;
> - bool isUndefHI = NumUndefsHI == Half;
> - return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
> - isUndefLO, isUndefHI);
> - }
> + // Normalize the scale for byte shifts to still produce an i64 element
> + // type.
> + Scale = ByteShift ? Scale / 2 : Scale;
>
> - return SDValue();
> -}
> + // We need to round trip through the appropriate type for the shift.
> + MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
> + MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
> + assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
> + "Illegal integer vector type");
> + V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
>
> -SDValue
> -X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
> - SDLoc dl(Op);
> + V = DAG.getNode(OpCode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, MVT::i8));
> + return DAG.getNode(ISD::BITCAST, DL, VT, V);
> + };
>
> - MVT VT = Op.getSimpleValueType();
> - MVT ExtVT = VT.getVectorElementType();
> - unsigned NumElems = Op.getNumOperands();
> + // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
> + // keep doubling the size of the integer elements up to that. We can
> + // then shift the elements of the integer vector by whole multiples of
> + // their width within the elements of the larger integer vector. Test each
> + // multiple to see if we can find a match with the moved element indices
> + // and that the shifted in elements are all zeroable.
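> + // For example, the v4i32 mask [zz, 0, zz, 2] matches with Scale == 2 and
> + // Shift == 1: the vector is bitcast to v2i64 and shifted left by 32 bits
> + // (a PSLLQ).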
> + for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 128; Scale *= 2)
> + for (int Shift = 1; Shift != Scale; ++Shift)
> + for (bool Left : {true, false})
> + if (CheckZeros(Shift, Scale, Left))
> + for (SDValue V : {V1, V2})
> + if (SDValue Match = MatchShift(Shift, Scale, Left, V))
> + return Match;
>
> - // Generate vectors for predicate vectors.
> - if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
> - return LowerBUILD_VECTORvXi1(Op, DAG);
> + // no match
> + return SDValue();
> +}
>
> - // Vectors containing all zeros can be matched by pxor and xorps later
> - if (ISD::isBuildVectorAllZeros(Op.getNode())) {
> - // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
> - // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
> - if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
> - return Op;
> +/// \brief Lower a vector shuffle as a zero or any extension.
> +///
> +/// Given a specific number of elements, element bit width, and extension
> +/// stride, produce either a zero or any extension based on the available
> +/// features of the subtarget.
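> +///
> +/// For example, with a v8i16 type and a stride (Scale) of 2, SSE4.1 targets
> +/// get a single X86ISD::VZEXT to v4i32 (pmovzxwd); older targets fall back to
> +/// the shuffle and unpack sequences below.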
> +static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
> + SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
> + const X86Subtarget *Subtarget, SelectionDAG &DAG) {
> + assert(Scale > 1 && "Need a scale to extend.");
> + int NumElements = VT.getVectorNumElements();
> + int EltBits = VT.getScalarSizeInBits();
> + assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
> + "Only 8, 16, and 32 bit elements can be extended.");
> + assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
>
> - return getZeroVector(VT, Subtarget, DAG, dl);
> + // Found a valid zext mask! Try various lowering strategies based on the
> + // input type and available ISA extensions.
> + if (Subtarget->hasSSE41()) {
> + MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
> + NumElements / Scale);
> + return DAG.getNode(ISD::BITCAST, DL, VT,
> + DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
> }
>
> - // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
> - // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
> - // vpcmpeqd on 256-bit vectors.
> - if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
> - if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
> - return Op;
> -
> - if (!VT.is512BitVector())
> - return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
> + // For any extends we can cheat for larger element sizes and use shuffle
> + // instructions that can fold with a load and/or copy.
> + if (AnyExt && EltBits == 32) {
> + int PSHUFDMask[4] = {0, -1, 1, -1};
> + return DAG.getNode(
> + ISD::BITCAST, DL, VT,
> + DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
> + DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
> + getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
> }
> -
> - SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
> - if (Broadcast.getNode())
> - return Broadcast;
> -
> - unsigned EVTBits = ExtVT.getSizeInBits();
> -
> - unsigned NumZero = 0;
> - unsigned NumNonZero = 0;
> - unsigned NonZeros = 0;
> - bool IsAllConstants = true;
> - SmallSet<SDValue, 8> Values;
> - for (unsigned i = 0; i < NumElems; ++i) {
> - SDValue Elt = Op.getOperand(i);
> - if (Elt.getOpcode() == ISD::UNDEF)
> - continue;
> - Values.insert(Elt);
> - if (Elt.getOpcode() != ISD::Constant &&
> - Elt.getOpcode() != ISD::ConstantFP)
> - IsAllConstants = false;
> - if (X86::isZeroNode(Elt))
> - NumZero++;
> - else {
> - NonZeros |= (1 << i);
> - NumNonZero++;
> - }
> + if (AnyExt && EltBits == 16 && Scale > 2) {
> + int PSHUFDMask[4] = {0, -1, 0, -1};
> + InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
> + DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
> + getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG));
> + int PSHUFHWMask[4] = {1, -1, -1, -1};
> + return DAG.getNode(
> + ISD::BITCAST, DL, VT,
> + DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
> + DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV),
> + getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG)));
> }
>
> - // All undef vector. Return an UNDEF. All zero vectors were handled above.
> - if (NumNonZero == 0)
> - return DAG.getUNDEF(VT);
> -
> - // Special case for single non-zero, non-undef, element.
> - if (NumNonZero == 1) {
> - unsigned Idx = countTrailingZeros(NonZeros);
> - SDValue Item = Op.getOperand(Idx);
> + // If this would require more than 2 unpack instructions to expand, use
> + // pshufb when available. We can only use more than 2 unpack instructions
> + // when zero extending i8 elements which also makes it easier to use pshufb.
> + if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
> + assert(NumElements == 16 && "Unexpected byte vector width!");
> + SDValue PSHUFBMask[16];
> + for (int i = 0; i < 16; ++i)
> + PSHUFBMask[i] =
> + DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, MVT::i8);
> + InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
> + return DAG.getNode(ISD::BITCAST, DL, VT,
> + DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
> + DAG.getNode(ISD::BUILD_VECTOR, DL,
> + MVT::v16i8, PSHUFBMask)));
> + }
>
> - // If this is an insertion of an i64 value on x86-32, and if the top bits of
> - // the value are obviously zero, truncate the value to i32 and do the
> - // insertion that way. Only do this if the value is non-constant or if the
> - // value is a constant being inserted into element 0. It is cheaper to do
> - // a constant pool load than it is to do a movd + shuffle.
> - if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
> - (!IsAllConstants || Idx == 0)) {
> - if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
> - // Handle SSE only.
> - assert(VT == MVT::v2i64 && "Expected an SSE value type!");
> - EVT VecVT = MVT::v4i32;
> - unsigned VecElts = 4;
> + // Otherwise emit a sequence of unpacks.
> + do {
> + MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
> + SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
> + : getZeroVector(InputVT, Subtarget, DAG, DL);
> + InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
> + InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
> + Scale /= 2;
> + EltBits *= 2;
> + NumElements /= 2;
> + } while (Scale > 1);
> + return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
> +}
>
> - // Truncate the value (which may itself be a constant) to i32, and
> - // convert it to a vector with movd (S2V+shuffle to zero extend).
> - Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
> - Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
> +/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
> +///
> +/// This routine will try to do everything in its power to cleverly lower
> +/// a shuffle which happens to match the pattern of a zero extend. It doesn't
> +/// check for the profitability of this lowering, it tries to aggressively
> +/// match this pattern. It will use all of the micro-architectural details it
> +/// can to emit an efficient lowering. It handles both blends with all-zero
> +/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
> +/// masking out later).
> +///
> +/// The reason we have dedicated lowering for zext-style shuffles is that they
> +/// are both incredibly common and often quite performance sensitive.
> +static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
> + SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
> + const X86Subtarget *Subtarget, SelectionDAG &DAG) {
> + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
>
> - // If using the new shuffle lowering, just directly insert this.
> - if (ExperimentalVectorShuffleLowering)
> - return DAG.getNode(
> - ISD::BITCAST, dl, VT,
> - getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
> -
> - Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
> -
> - // Now we have our 32-bit value zero extended in the low element of
> - // a vector. If Idx != 0, swizzle it into place.
> - if (Idx != 0) {
> - SmallVector<int, 4> Mask;
> - Mask.push_back(Idx);
> - for (unsigned i = 1; i != VecElts; ++i)
> - Mask.push_back(i);
> - Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
> - &Mask[0]);
> - }
> - return DAG.getNode(ISD::BITCAST, dl, VT, Item);
> - }
> - }
> + int Bits = VT.getSizeInBits();
> + int NumElements = VT.getVectorNumElements();
> + assert(VT.getScalarSizeInBits() <= 32 &&
> + "Exceeds 32-bit integer zero extension limit");
> + assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
>
> - // If we have a constant or non-constant insertion into the low element of
> - // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
> - // the rest of the elements. This will be matched as movd/movq/movss/movsd
> - // depending on what the source datatype is.
> - if (Idx == 0) {
> - if (NumZero == 0)
> - return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
> + // Define a helper function to check a particular ext-scale and lower to it if
> + // valid.
> + auto Lower = [&](int Scale) -> SDValue {
> + SDValue InputV;
> + bool AnyExt = true;
> + for (int i = 0; i < NumElements; ++i) {
> + if (Mask[i] == -1)
> + continue; // Valid anywhere but doesn't tell us anything.
> + if (i % Scale != 0) {
> + // Each of the extended elements needs to be zeroable.
> + if (!Zeroable[i])
> + return SDValue();
>
> - if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
> - (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
> - if (VT.is256BitVector() || VT.is512BitVector()) {
> - SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
> - return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
> - Item, DAG.getIntPtrConstant(0));
> - }
> - assert(VT.is128BitVector() && "Expected an SSE value type!");
> - Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
> - // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
> - return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
> + // We no longer are in the anyext case.
> + AnyExt = false;
> + continue;
> }
>
> - if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
> - Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
> - Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
> - if (VT.is256BitVector()) {
> - SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
> - Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
> - } else {
> - assert(VT.is128BitVector() && "Expected an SSE value type!");
> - Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
> - }
> - return DAG.getNode(ISD::BITCAST, dl, VT, Item);
> - }
> - }
> + // Each of the base elements needs to be consecutive indices into the
> + // same input vector.
> + SDValue V = Mask[i] < NumElements ? V1 : V2;
> + if (!InputV)
> + InputV = V;
> + else if (InputV != V)
> + return SDValue(); // Flip-flopping inputs.
>
> - // Is it a vector logical left shift?
> - if (NumElems == 2 && Idx == 1 &&
> - X86::isZeroNode(Op.getOperand(0)) &&
> - !X86::isZeroNode(Op.getOperand(1))) {
> - unsigned NumBits = VT.getSizeInBits();
> - return getVShift(true, VT,
> - DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
> - VT, Op.getOperand(1)),
> - NumBits/2, DAG, *this, dl);
> + if (Mask[i] % NumElements != i / Scale)
> + return SDValue(); // Non-consecutive strided elements.
> }
>
> - if (IsAllConstants) // Otherwise, it's better to do a constpool load.
> + // If we fail to find an input, we have a zero-shuffle which should always
> + // have already been handled.
> + // FIXME: Maybe handle this here in case during blending we end up with one?
> + if (!InputV)
> return SDValue();
>
> - // Otherwise, if this is a vector with i32 or f32 elements, and the element
> - // is a non-constant being inserted into an element other than the low one,
> - // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
> - // movd/movss) to move this into the low element, then shuffle it into
> - // place.
> - if (EVTBits == 32) {
> - Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
> + return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
> + DL, VT, Scale, AnyExt, InputV, Subtarget, DAG);
> + };
>
> - // If using the new shuffle lowering, just directly insert this.
> - if (ExperimentalVectorShuffleLowering)
> - return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
> -
> - // Turn it into a shuffle of zero and zero-extended scalar to vector.
> - Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
> - SmallVector<int, 8> MaskVec;
> - for (unsigned i = 0; i != NumElems; ++i)
> - MaskVec.push_back(i == Idx ? 0 : 1);
> - return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
> - }
> - }
> + // The widest scale possible for extending is to a 64-bit integer.
> + assert(Bits % 64 == 0 &&
> + "The number of bits in a vector must be divisible by 64 on x86!");
> + int NumExtElements = Bits / 64;
>
> - // Splat is obviously ok. Let legalizer expand it to a shuffle.
> - if (Values.size() == 1) {
> - if (EVTBits == 32) {
> - // Instead of a shuffle like this:
> - // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
> - // Check if it's possible to issue this instead.
> - // shuffle (vload ptr)), undef, <1, 1, 1, 1>
> - unsigned Idx = countTrailingZeros(NonZeros);
> - SDValue Item = Op.getOperand(Idx);
> - if (Op.getNode()->isOnlyUserOf(Item.getNode()))
> - return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
> - }
> - return SDValue();
> + // Each iteration, try extending the elements half as much, but into twice as
> + // many elements.
> + for (; NumExtElements < NumElements; NumExtElements *= 2) {
> + assert(NumElements % NumExtElements == 0 &&
> + "The input vector size must be divisible by the extended size.");
> + if (SDValue V = Lower(NumElements / NumExtElements))
> + return V;
> }
>
> - // A vector full of immediates; various special cases are already
> - // handled, so this is best done with a single constant-pool load.
> - if (IsAllConstants)
> + // General extends failed, but 128-bit vectors may be able to use MOVQ.
> + if (Bits != 128)
> return SDValue();
>
> - // For AVX-length vectors, see if we can use a vector load to get all of the
> - // elements, otherwise build the individual 128-bit pieces and use
> - // shuffles to put them in place.
> - if (VT.is256BitVector() || VT.is512BitVector()) {
> - SmallVector<SDValue, 64> V(Op->op_begin(), Op->op_begin() + NumElems);
> + // Returns one of the source operands if the shuffle can be reduced to a
> + // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
> + auto CanZExtLowHalf = [&]() {
> + for (int i = NumElements / 2; i != NumElements; ++i)
> + if (!Zeroable[i])
> + return SDValue();
> + if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
> + return V1;
> + if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
> + return V2;
> + return SDValue();
> + };
>
> - // Check for a build vector of consecutive loads.
> - if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
> - return LD;
> + if (SDValue V = CanZExtLowHalf()) {
> + V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V);
> + V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
> + return DAG.getNode(ISD::BITCAST, DL, VT, V);
> + }
>
> - EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
> + // No viable ext lowering found.
> + return SDValue();
> +}
>
> - // Build both the lower and upper subvector.
> - SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
> - makeArrayRef(&V[0], NumElems/2));
> - SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
> - makeArrayRef(&V[NumElems / 2], NumElems/2));
> +/// \brief Try to get a scalar value for a specific element of a vector.
> +///
> +/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
> +static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
> + SelectionDAG &DAG) {
> + MVT VT = V.getSimpleValueType();
> + MVT EltVT = VT.getVectorElementType();
> + while (V.getOpcode() == ISD::BITCAST)
> + V = V.getOperand(0);
> + // If the bitcasts shift the element size, we can't extract an equivalent
> + // element from it.
> + MVT NewVT = V.getSimpleValueType();
> + if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
> + return SDValue();
>
> - // Recreate the wider vector with the lower and upper part.
> - if (VT.is256BitVector())
> - return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
> - return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
> - }
> + if (V.getOpcode() == ISD::BUILD_VECTOR ||
> + (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR))
> + return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, V.getOperand(Idx));
>
> - // Let legalizer expand 2-wide build_vectors.
> - if (EVTBits == 64) {
> - if (NumNonZero == 1) {
> - // One half is zero or undef.
> - unsigned Idx = countTrailingZeros(NonZeros);
> - SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
> - Op.getOperand(Idx));
> - return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
> - }
> - return SDValue();
> - }
> + return SDValue();
> +}
>
> - // If element VT is < 32 bits, convert it to inserts into a zero vector.
> - if (EVTBits == 8 && NumElems == 16) {
> - SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
> - Subtarget, *this);
> - if (V.getNode()) return V;
> - }
> +/// \brief Helper to test for a load that can be folded with x86 shuffles.
> +///
> +/// This is particularly important because the set of instructions varies
> +/// significantly based on whether the operand is a load or not.
> +static bool isShuffleFoldableLoad(SDValue V) {
> + while (V.getOpcode() == ISD::BITCAST)
> + V = V.getOperand(0);
>
> - if (EVTBits == 16 && NumElems == 8) {
> - SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
> - Subtarget, *this);
> - if (V.getNode()) return V;
> - }
> + return ISD::isNON_EXTLoad(V.getNode());
> +}
>
> - // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
> - if (EVTBits == 32 && NumElems == 4) {
> - SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this);
> - if (V.getNode())
> - return V;
> - }
> +/// \brief Try to lower insertion of a single element into a zero vector.
> +///
> +/// This is a common pattern that we have especially efficient patterns to lower
> +/// across all subtarget feature sets.
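> +///
> +/// For example, the v4i32 shuffle <4, -1, -1, -1> (insert element 0 of V2
> +/// into lane 0 with the remaining lanes zeroable) is emitted below via
> +/// X86ISD::VZEXT_MOVL, which moves the low element and zeroes the rest.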
> +static SDValue lowerVectorShuffleAsElementInsertion(
> + MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
> + const X86Subtarget *Subtarget, SelectionDAG &DAG) {
> + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
> + MVT ExtVT = VT;
> + MVT EltVT = VT.getVectorElementType();
>
> - // If element VT is == 32 bits, turn it into a number of shuffles.
> - SmallVector<SDValue, 8> V(NumElems);
> - if (NumElems == 4 && NumZero > 0) {
> - for (unsigned i = 0; i < 4; ++i) {
> - bool isZero = !(NonZeros & (1 << i));
> - if (isZero)
> - V[i] = getZeroVector(VT, Subtarget, DAG, dl);
> - else
> - V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
> + int V2Index = std::find_if(Mask.begin(), Mask.end(),
> + [&Mask](int M) { return M >= (int)Mask.size(); }) -
> + Mask.begin();
> + bool IsV1Zeroable = true;
> + for (int i = 0, Size = Mask.size(); i < Size; ++i)
> + if (i != V2Index && !Zeroable[i]) {
> + IsV1Zeroable = false;
> + break;
> }
>
> - for (unsigned i = 0; i < 2; ++i) {
> - switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
> - default: break;
> - case 0:
> - V[i] = V[i*2]; // Must be a zero vector.
> - break;
> - case 1:
> - V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
> - break;
> - case 2:
> - V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
> - break;
> - case 3:
> - V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
> - break;
> - }
> - }
> + // Check for a single input from a SCALAR_TO_VECTOR node.
> + // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
> + // all the smarts here sunk into that routine. However, the current
> + // lowering of BUILD_VECTOR makes that nearly impossible until the old
> + // vector shuffle lowering is dead.
> + if (SDValue V2S = getScalarValueForVectorElement(
> + V2, Mask[V2Index] - Mask.size(), DAG)) {
> + // We need to zext the scalar if it is smaller than an i32.
> + V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
> + if (EltVT == MVT::i8 || EltVT == MVT::i16) {
> + // Using zext to expand a narrow element won't work for non-zero
> + // insertions.
> + if (!IsV1Zeroable)
> + return SDValue();
>
> - bool Reverse1 = (NonZeros & 0x3) == 2;
> - bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
> - int MaskVec[] = {
> - Reverse1 ? 1 : 0,
> - Reverse1 ? 0 : 1,
> - static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
> - static_cast<int>(Reverse2 ? NumElems : NumElems+1)
> - };
> - return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
> + // Zero-extend directly to i32.
> + ExtVT = MVT::v4i32;
> + V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
> + }
> + V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
> + } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
> + EltVT == MVT::i16) {
> + // Either not inserting from the low element of the input or the input
> + // element size is too small to use VZEXT_MOVL to clear the high bits.
> + return SDValue();
> }
>
> - if (Values.size() > 1 && VT.is128BitVector()) {
> - // Check for a build vector of consecutive loads.
> - for (unsigned i = 0; i < NumElems; ++i)
> - V[i] = Op.getOperand(i);
> -
> - // Check for elements which are consecutive loads.
> - SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
> - if (LD.getNode())
> - return LD;
> -
> - // Check for a build vector from mostly shuffle plus few inserting.
> - SDValue Sh = buildFromShuffleMostly(Op, DAG);
> - if (Sh.getNode())
> - return Sh;
> -
> - // For SSE 4.1, use insertps to put the high elements into the low element.
> - if (Subtarget->hasSSE41()) {
> - SDValue Result;
> - if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
> - Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
> - else
> - Result = DAG.getUNDEF(VT);
> + if (!IsV1Zeroable) {
> + // If V1 can't be treated as a zero vector we have fewer options to lower
> + // this. We can't support integer vectors or non-zero targets cheaply, and
> + // the V1 elements can't be permuted in any way.
> + assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
> + if (!VT.isFloatingPoint() || V2Index != 0)
> + return SDValue();
> + SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
> + V1Mask[V2Index] = -1;
> + if (!isNoopShuffleMask(V1Mask))
> + return SDValue();
> + // This is essentially a special case blend operation, but if we have
> + // general purpose blend operations, they are always faster. Bail and let
> + // the rest of the lowering handle these as blends.
> + if (Subtarget->hasSSE41())
> + return SDValue();
>
> - for (unsigned i = 1; i < NumElems; ++i) {
> - if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
> - Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
> - Op.getOperand(i), DAG.getIntPtrConstant(i));
> - }
> - return Result;
> - }
> + // Otherwise, use MOVSD or MOVSS.
> + assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
> + "Only two types of floating point element types to handle!");
> + return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
> + ExtVT, V1, V2);
> + }
>
> - // Otherwise, expand into a number of unpckl*, start by extending each of
> - // our (non-undef) elements to the full vector width with the element in the
> - // bottom slot of the vector (which generates no code for SSE).
> - for (unsigned i = 0; i < NumElems; ++i) {
> - if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
> - V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
> - else
> - V[i] = DAG.getUNDEF(VT);
> - }
> + // This lowering only works for the low element with floating point vectors.
> + if (VT.isFloatingPoint() && V2Index != 0)
> + return SDValue();
>
> - // Next, we iteratively mix elements, e.g. for v4f32:
> - // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
> - // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
> - // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
> - unsigned EltStride = NumElems >> 1;
> - while (EltStride != 0) {
> - for (unsigned i = 0; i < EltStride; ++i) {
> - // If V[i+EltStride] is undef and this is the first round of mixing,
> - // then it is safe to just drop this shuffle: V[i] is already in the
> - // right place, the one element (since it's the first round) being
> - // inserted as undef can be dropped. This isn't safe for successive
> - // rounds because they will permute elements within both vectors.
> - if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
> - EltStride == NumElems/2)
> - continue;
> + V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
> + if (ExtVT != VT)
> + V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
>
> - V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
> - }
> - EltStride >>= 1;
> + if (V2Index != 0) {
> + // If we have 4 or fewer lanes we can cheaply shuffle the element into
> + // the desired position. Otherwise it is more efficient to do a vector
> + // shift left. We know that we can do a vector shift left because all
> + // the inputs are zero.
> + if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
> + SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
> + V2Shuffle[V2Index] = 0;
> + V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
> + } else {
> + V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2);
> + V2 = DAG.getNode(
> + X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
> + DAG.getConstant(
> + V2Index * EltVT.getSizeInBits()/8,
> + DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
> + V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
> }
> - return V[0];
> }
> - return SDValue();
> + return V2;
> }
>
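For anyone tracing the VSHLDQ fallback at the end of this routine: the byte-shift amount is just the destination lane index scaled by the element width in bytes. A minimal standalone sketch of that arithmetic, with a helper name of my own (it is not part of the patch):

    // Byte count that pslldq must shift a zero-extended scalar left by so
    // that it lands in lane V2Index of a vector with EltBits-wide elements.
    constexpr unsigned vshldqByteAmount(unsigned V2Index, unsigned EltBits) {
      return V2Index * (EltBits / 8);
    }
    // e.g. inserting into lane 5 of a v8i16 requires a 10-byte shift.
    static_assert(vshldqByteAmount(5, 16) == 10, "byte shift amount");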
> -// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
> -// to create 256-bit vectors from two other 128-bit ones.
> -static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
> - SDLoc dl(Op);
> - MVT ResVT = Op.getSimpleValueType();
> -
> - assert((ResVT.is256BitVector() ||
> - ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
> -
> - SDValue V1 = Op.getOperand(0);
> - SDValue V2 = Op.getOperand(1);
> - unsigned NumElems = ResVT.getVectorNumElements();
> - if(ResVT.is256BitVector())
> - return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
> +/// \brief Try to lower broadcast of a single element.
> +///
> +/// For convenience, this code also bundles all of the subtarget feature set
> +/// filtering. While a little annoying to re-dispatch on type here, there isn't
> +/// a convenient way to factor it out.
> +static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
> + ArrayRef<int> Mask,
> + const X86Subtarget *Subtarget,
> + SelectionDAG &DAG) {
> + if (!Subtarget->hasAVX())
> + return SDValue();
> + if (VT.isInteger() && !Subtarget->hasAVX2())
> + return SDValue();
>
> - if (Op.getNumOperands() == 4) {
> - MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
> - ResVT.getVectorNumElements()/2);
> - SDValue V3 = Op.getOperand(2);
> - SDValue V4 = Op.getOperand(3);
> - return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
> - Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
> - }
> - return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
> -}
> + // Check that the mask is a broadcast.
> + int BroadcastIdx = -1;
> + for (int M : Mask)
> + if (M >= 0 && BroadcastIdx == -1)
> + BroadcastIdx = M;
> + else if (M >= 0 && M != BroadcastIdx)
> + return SDValue();
>
> -static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
> - MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
> - assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
> - (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
> - Op.getNumOperands() == 4)));
> + assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
> + "a sorted mask where the broadcast "
> + "comes from V1.");
>
> - // AVX can use the vinsertf128 instruction to create 256-bit vectors
> - // from two other 128-bit ones.
> + // Go up the chain of (vector) values to try and find a scalar load that
> + // we can combine with the broadcast.
> + for (;;) {
> + switch (V.getOpcode()) {
> + case ISD::CONCAT_VECTORS: {
> + int OperandSize = Mask.size() / V.getNumOperands();
> + V = V.getOperand(BroadcastIdx / OperandSize);
> + BroadcastIdx %= OperandSize;
> + continue;
> + }
>
> - // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
> - return LowerAVXCONCAT_VECTORS(Op, DAG);
> -}
> + case ISD::INSERT_SUBVECTOR: {
> + SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
> + auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
> + if (!ConstantIdx)
> + break;
>
> + int BeginIdx = (int)ConstantIdx->getZExtValue();
> + int EndIdx =
> + BeginIdx + (int)VInner.getValueType().getVectorNumElements();
> + if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
> + BroadcastIdx -= BeginIdx;
> + V = VInner;
> + } else {
> + V = VOuter;
> + }
> + continue;
> + }
> + }
> + break;
> + }
>
> -//===----------------------------------------------------------------------===//
> -// Vector shuffle lowering
> -//
> -// This is an experimental code path for lowering vector shuffles on x86. It is
> -// designed to handle arbitrary vector shuffles and blends, gracefully
> -// degrading performance as necessary. It works hard to recognize idiomatic
> -// shuffles and lower them to optimal instruction patterns without leaving
> -// a framework that allows reasonably efficient handling of all vector shuffle
> -// patterns.
> -//===----------------------------------------------------------------------===//
> + // Check if this is a broadcast of a scalar. We special case lowering
> + // for scalars so that we can more effectively fold with loads.
> + if (V.getOpcode() == ISD::BUILD_VECTOR ||
> + (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
> + V = V.getOperand(BroadcastIdx);
>
> -/// \brief Tiny helper function to identify a no-op mask.
> -///
> -/// This is a somewhat boring predicate function. It checks whether the mask
> -/// array input, which is assumed to be a single-input shuffle mask of the kind
> -/// used by the X86 shuffle instructions (not a fully general
> -/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
> -/// in-place shuffle are 'no-op's.
> -static bool isNoopShuffleMask(ArrayRef<int> Mask) {
> - for (int i = 0, Size = Mask.size(); i < Size; ++i)
> - if (Mask[i] != -1 && Mask[i] != i)
> - return false;
> - return true;
> -}
> + // If the scalar isn't a load we can't broadcast from it in AVX1, only with
> + // AVX2.
> + if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
> + return SDValue();
> + } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
> + // We can't broadcast from a vector register w/o AVX2, and we can only
> + // broadcast from the zero-element of a vector register.
> + return SDValue();
> + }
>
> -/// \brief Helper function to classify a mask as a single-input mask.
> -///
> -/// This isn't a generic single-input test because in the vector shuffle
> -/// lowering we canonicalize single inputs to be the first input operand. This
> -/// means we can more quickly test for a single input by only checking whether
> -/// an input from the second operand exists. We also assume that the size of
> -/// mask corresponds to the size of the input vectors which isn't true in the
> -/// fully general case.
> -static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
> - for (int M : Mask)
> - if (M >= (int)Mask.size())
> - return false;
> - return true;
> + return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
> }
>
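To illustrate the CONCAT_VECTORS case in the chain walk above: the broadcast index is split into an operand number and a lane within that operand, assuming all concat operands are the same width. A small sketch of that index math (helper names are mine, purely illustrative):

    // Which operand of an N-way concat a lane index falls into, and the lane
    // within that operand, mirroring the division and modulo in the loop above.
    constexpr int concatOperand(int Idx, int MaskSize, int NumOps) {
      return Idx / (MaskSize / NumOps);
    }
    constexpr int concatLane(int Idx, int MaskSize, int NumOps) {
      return Idx % (MaskSize / NumOps);
    }
    // e.g. lane 5 of a two-operand v8i32 concatenation is lane 1 of operand 1.
    static_assert(concatOperand(5, 8, 2) == 1 && concatLane(5, 8, 2) == 1,
                  "concat index remap");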
> -/// \brief Test whether there are elements crossing 128-bit lanes in this
> -/// shuffle mask.
> -///
> -/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
> -/// and we routinely test for these.
> -static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
> - int LaneSize = 128 / VT.getScalarSizeInBits();
> - int Size = Mask.size();
> - for (int i = 0; i < Size; ++i)
> - if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
> - return true;
> - return false;
> -}
> +// Check for whether we can use INSERTPS to perform the shuffle. We only use
> +// INSERTPS when the V1 elements are already in the correct locations
> +// because otherwise we can just always use two SHUFPS instructions which
> +// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
> +// perform INSERTPS if a single V1 element is out of place and all V2
> +// elements are zeroable.
> +static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
> + ArrayRef<int> Mask,
> + SelectionDAG &DAG) {
> + assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
> + assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
> + assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
> + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
>
> -/// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
> -///
> -/// This checks a shuffle mask to see if it is performing the same
> -/// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
> -/// that it is also not lane-crossing. It may however involve a blend from the
> -/// same lane of a second vector.
> -///
> -/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
> -/// non-trivial to compute in the face of undef lanes. The representation is
> -/// *not* suitable for use with existing 128-bit shuffles as it will contain
> -/// entries from both V1 and V2 inputs to the wider mask.
> -static bool
> -is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
> - SmallVectorImpl<int> &RepeatedMask) {
> - int LaneSize = 128 / VT.getScalarSizeInBits();
> - RepeatedMask.resize(LaneSize, -1);
> - int Size = Mask.size();
> - for (int i = 0; i < Size; ++i) {
> - if (Mask[i] < 0)
> + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
> +
> + unsigned ZMask = 0;
> + int V1DstIndex = -1;
> + int V2DstIndex = -1;
> + bool V1UsedInPlace = false;
> +
> + for (int i = 0; i < 4; ++i) {
> + // Synthesize a zero mask from the zeroable elements (includes undefs).
> + if (Zeroable[i]) {
> + ZMask |= 1 << i;
> continue;
> - if ((Mask[i] % Size) / LaneSize != i / LaneSize)
> - // This entry crosses lanes, so there is no way to model this shuffle.
> - return false;
> + }
>
> - // Ok, handle the in-lane shuffles by detecting if and when they repeat.
> - if (RepeatedMask[i % LaneSize] == -1)
> - // This is the first non-undef entry in this slot of a 128-bit lane.
> - RepeatedMask[i % LaneSize] =
> - Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
> - else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
> - // Found a mismatch with the repeated mask.
> - return false;
> + // Flag if we use any V1 inputs in place.
> + if (i == Mask[i]) {
> + V1UsedInPlace = true;
> + continue;
> + }
> +
> + // We can only insert a single non-zeroable element.
> + if (V1DstIndex != -1 || V2DstIndex != -1)
> + return SDValue();
> +
> + if (Mask[i] < 4) {
> + // V1 input out of place for insertion.
> + V1DstIndex = i;
> + } else {
> + // V2 input for insertion.
> + V2DstIndex = i;
> + }
> }
> - return true;
> -}
>
> -/// \brief Base case helper for testing a single mask element.
> -static bool isShuffleEquivalentImpl(SDValue V1, SDValue V2,
> - BuildVectorSDNode *BV1,
> - BuildVectorSDNode *BV2, ArrayRef<int> Mask,
> - int i, int Arg) {
> - int Size = Mask.size();
> - if (Mask[i] != -1 && Mask[i] != Arg) {
> - auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
> - auto *ArgsBV = Arg < Size ? BV1 : BV2;
> - if (!MaskBV || !ArgsBV ||
> - MaskBV->getOperand(Mask[i] % Size) != ArgsBV->getOperand(Arg % Size))
> - return false;
> + // Don't bother if we have no (non-zeroable) element for insertion.
> + if (V1DstIndex == -1 && V2DstIndex == -1)
> + return SDValue();
> +
> + // Determine element insertion src/dst indices. The src index is from the
> + // start of the inserted vector, not the start of the concatenated vector.
> + unsigned V2SrcIndex = 0;
> + if (V1DstIndex != -1) {
> + // If we have a V1 input out of place, we use V1 as the V2 element insertion
> + // and don't use the original V2 at all.
> + V2SrcIndex = Mask[V1DstIndex];
> + V2DstIndex = V1DstIndex;
> + V2 = V1;
> + } else {
> + V2SrcIndex = Mask[V2DstIndex] - 4;
> }
> - return true;
> -}
>
> -/// \brief Recursive helper to peel off and test each mask element.
> -template <typename... Ts>
> -static bool isShuffleEquivalentImpl(SDValue V1, SDValue V2,
> - BuildVectorSDNode *BV1,
> - BuildVectorSDNode *BV2, ArrayRef<int> Mask,
> - int i, int Arg, Ts... Args) {
> - if (!isShuffleEquivalentImpl(V1, V2, BV1, BV2, Mask, i, Arg))
> - return false;
> + // If no V1 inputs are used in place, then the result is created only from
> + // the zero mask and the V2 insertion - so remove V1 dependency.
> + if (!V1UsedInPlace)
> + V1 = DAG.getUNDEF(MVT::v4f32);
>
> - return isShuffleEquivalentImpl(V1, V2, BV1, BV2, Mask, i + 1, Args...);
> -}
> + unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
> + assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
>
> -/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
> -/// arguments.
> -///
> -/// This is a fast way to test a shuffle mask against a fixed pattern:
> -///
> -/// if (isShuffleEquivalent(Mask, 3, 2, 1, 0)) { ... }
> -///
> -/// It returns true if the mask is exactly as wide as the argument list, and
> -/// each element of the mask is either -1 (signifying undef) or the value given
> -/// in the argument.
> -template <typename... Ts>
> -static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
> - Ts... Args) {
> - if (Mask.size() != sizeof...(Args))
> - return false;
> -
> - // If the values are build vectors, we can look through them to find
> - // equivalent inputs that make the shuffles equivalent.
> - auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
> - auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
> -
> - // Recursively peel off arguments and test them against the mask.
> - return isShuffleEquivalentImpl(V1, V2, BV1, BV2, Mask, 0, Args...);
> + // Insert the V2 element into the desired position.
> + SDLoc DL(Op);
> + return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
> + DAG.getConstant(InsertPSMask, MVT::i8));
> }
>
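For readers decoding the InsertPSMask built above: the 8-bit INSERTPS immediate packs the source lane into bits 7:6, the destination lane into bits 5:4, and the zero mask into bits 3:0. A minimal sketch of that packing (the helper is hypothetical, not part of the patch):

    // Pack an INSERTPS immediate the same way the code above does:
    // source lane in bits 7:6, destination lane in bits 5:4, zero mask in 3:0.
    constexpr unsigned insertPSImm(unsigned SrcIdx, unsigned DstIdx,
                                   unsigned ZMask) {
      return (SrcIdx << 6) | (DstIdx << 4) | (ZMask & 0xF);
    }
    // Take lane 0 of the inserted vector, write it to lane 2, and zero lane 3.
    static_assert(insertPSImm(0, 2, 0x8) == 0x28, "insertps immediate");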
> -/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
> -///
> -/// This helper function produces an 8-bit shuffle immediate corresponding to
> -/// the ubiquitous shuffle encoding scheme used in x86 instructions for
> -/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
> -/// example.
> +/// \brief Try to lower a shuffle as a permute of the inputs followed by an
> +/// UNPCK instruction.
> ///
> -/// NB: We rely heavily on "undef" masks preserving the input lane.
> -static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
> +/// This specifically targets cases where we end up alternating between
> +/// the two inputs, and so can permute them into something that feeds a single
> +/// UNPCK instruction. Note that this routine only targets integer vectors
> +/// because for floating point vectors we have a generalized SHUFPS lowering
> +/// strategy that handles everything that doesn't *exactly* match an unpack,
> +/// making this clever lowering unnecessary.
> +static SDValue lowerVectorShuffleAsUnpack(MVT VT, SDLoc DL, SDValue V1,
> + SDValue V2, ArrayRef<int> Mask,
> SelectionDAG &DAG) {
> - assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
> - assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
> - assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
> - assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
> - assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
> + assert(!VT.isFloatingPoint() &&
> + "This routine only supports integer vectors.");
> + assert(!isSingleInputShuffleMask(Mask) &&
> + "This routine should only be used when blending two inputs.");
> + assert(Mask.size() >= 2 && "Single element masks are invalid.");
>
> - unsigned Imm = 0;
> - Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
> - Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
> - Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
> - Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
> - return DAG.getConstant(Imm, MVT::i8);
> -}
> + int Size = Mask.size();
>
> -/// \brief Try to emit a blend instruction for a shuffle using bit math.
> -///
> -/// This is used as a fallback approach when first class blend instructions are
> -/// unavailable. Currently it is only suitable for integer vectors, but could
> -/// be generalized for floating point vectors if desirable.
> -static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1,
> - SDValue V2, ArrayRef<int> Mask,
> - SelectionDAG &DAG) {
> - assert(VT.isInteger() && "Only supports integer vector types!");
> - MVT EltVT = VT.getScalarType();
> - int NumEltBits = EltVT.getSizeInBits();
> - SDValue Zero = DAG.getConstant(0, EltVT);
> - SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), EltVT);
> - SmallVector<SDValue, 16> MaskOps;
> - for (int i = 0, Size = Mask.size(); i < Size; ++i) {
> - if (Mask[i] != -1 && Mask[i] != i && Mask[i] != i + Size)
> - return SDValue(); // Shuffled input!
> - MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
> - }
> + int NumLoInputs = std::count_if(Mask.begin(), Mask.end(), [Size](int M) {
> + return M >= 0 && M % Size < Size / 2;
> + });
> + int NumHiInputs = std::count_if(
> + Mask.begin(), Mask.end(), [Size](int M) { return M % Size >= Size / 2; });
>
> - SDValue V1Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, MaskOps);
> - V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
> - // We have to cast V2 around.
> - MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
> - V2 = DAG.getNode(ISD::BITCAST, DL, VT,
> - DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
> - DAG.getNode(ISD::BITCAST, DL, MaskVT, V1Mask),
> - DAG.getNode(ISD::BITCAST, DL, MaskVT, V2)));
> - return DAG.getNode(ISD::OR, DL, VT, V1, V2);
> -}
> + bool UnpackLo = NumLoInputs >= NumHiInputs;
>
> -/// \brief Try to emit a blend instruction for a shuffle.
> -///
> -/// This doesn't do any checks for the availability of instructions for blending
> -/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
> -/// be matched in the backend with the type given. What it does check for is
> -/// that the shuffle mask is in fact a blend.
> -static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
> - SDValue V2, ArrayRef<int> Mask,
> - const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> - unsigned BlendMask = 0;
> - for (int i = 0, Size = Mask.size(); i < Size; ++i) {
> - if (Mask[i] >= Size) {
> - if (Mask[i] != i + Size)
> - return SDValue(); // Shuffled V2 input!
> - BlendMask |= 1u << i;
> - continue;
> - }
> - if (Mask[i] >= 0 && Mask[i] != i)
> - return SDValue(); // Shuffled V1 input!
> - }
> - switch (VT.SimpleTy) {
> - case MVT::v2f64:
> - case MVT::v4f32:
> - case MVT::v4f64:
> - case MVT::v8f32:
> - return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
> - DAG.getConstant(BlendMask, MVT::i8));
> + auto TryUnpack = [&](MVT UnpackVT, int Scale) {
> + SmallVector<int, 32> V1Mask(Mask.size(), -1);
> + SmallVector<int, 32> V2Mask(Mask.size(), -1);
>
> - case MVT::v4i64:
> - case MVT::v8i32:
> - assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
> - // FALLTHROUGH
> - case MVT::v2i64:
> - case MVT::v4i32:
> - // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
> - // that instruction.
> - if (Subtarget->hasAVX2()) {
> - // Scale the blend by the number of 32-bit dwords per element.
> - int Scale = VT.getScalarSizeInBits() / 32;
> - BlendMask = 0;
> - for (int i = 0, Size = Mask.size(); i < Size; ++i)
> - if (Mask[i] >= Size)
> - for (int j = 0; j < Scale; ++j)
> - BlendMask |= 1u << (i * Scale + j);
> + for (int i = 0; i < Size; ++i) {
> + if (Mask[i] < 0)
> + continue;
>
> - MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
> - V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
> - V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
> - return DAG.getNode(ISD::BITCAST, DL, VT,
> - DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
> - DAG.getConstant(BlendMask, MVT::i8)));
> - }
> - // FALLTHROUGH
> - case MVT::v8i16: {
> - // For integer shuffles we need to expand the mask and cast the inputs to
> - // v8i16s prior to blending.
> - int Scale = 8 / VT.getVectorNumElements();
> - BlendMask = 0;
> - for (int i = 0, Size = Mask.size(); i < Size; ++i)
> - if (Mask[i] >= Size)
> - for (int j = 0; j < Scale; ++j)
> - BlendMask |= 1u << (i * Scale + j);
> + // Each element of the unpack contains Scale elements from this mask.
> + int UnpackIdx = i / Scale;
>
> - V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
> - V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
> - return DAG.getNode(ISD::BITCAST, DL, VT,
> - DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
> - DAG.getConstant(BlendMask, MVT::i8)));
> - }
> + // We only handle the case where V1 feeds the first slots of the unpack.
> + // We rely on canonicalization to ensure this is the case.
> + if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
> + return SDValue();
>
> - case MVT::v16i16: {
> - assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
> - SmallVector<int, 8> RepeatedMask;
> - if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
> - // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
> - assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
> - BlendMask = 0;
> - for (int i = 0; i < 8; ++i)
> - if (RepeatedMask[i] >= 16)
> - BlendMask |= 1u << i;
> - return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
> - DAG.getConstant(BlendMask, MVT::i8));
> + // Setup the mask for this input. The indexing is tricky as we have to
> + // handle the unpack stride.
> + SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
> + VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
> + Mask[i] % Size;
> }
> - }
> - // FALLTHROUGH
> - case MVT::v16i8:
> - case MVT::v32i8: {
> - // Scale the blend by the number of bytes per element.
> - int Scale = VT.getScalarSizeInBits() / 8;
>
> - // This form of blend is always done on bytes. Compute the byte vector
> - // type.
> - MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
> + // If we will have to shuffle both inputs to use the unpack, check whether
> + // we can just unpack first and shuffle the result. If so, skip this unpack.
> + if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
> + !isNoopShuffleMask(V2Mask))
> + return SDValue();
>
> - // Compute the VSELECT mask. Note that VSELECT is really confusing in the
> - // mix of LLVM's code generator and the x86 backend. We tell the code
> - // generator that boolean values in the elements of an x86 vector register
> - // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
> - // mapping a select to operand #1, and 'false' mapping to operand #2. The
> - // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
> - // of the element (the remaining are ignored) and 0 in that high bit would
> - // mean operand #1 while 1 in the high bit would mean operand #2. So while
> - // the LLVM model for boolean values in vector elements gets the relevant
> - // bit set, it is set backwards and over constrained relative to x86's
> - // actual model.
> - SmallVector<SDValue, 32> VSELECTMask;
> - for (int i = 0, Size = Mask.size(); i < Size; ++i)
> - for (int j = 0; j < Scale; ++j)
> - VSELECTMask.push_back(
> - Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
> - : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8));
> + // Shuffle the inputs into place.
> + V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
> + V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
>
> - V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
> - V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
> - return DAG.getNode(
> - ISD::BITCAST, DL, VT,
> - DAG.getNode(ISD::VSELECT, DL, BlendVT,
> - DAG.getNode(ISD::BUILD_VECTOR, DL, BlendVT, VSELECTMask),
> - V1, V2));
> - }
> + // Cast the inputs to the type we will use to unpack them.
> + V1 = DAG.getNode(ISD::BITCAST, DL, UnpackVT, V1);
> + V2 = DAG.getNode(ISD::BITCAST, DL, UnpackVT, V2);
>
> - default:
> - llvm_unreachable("Not a supported integer vector type!");
> - }
> -}
> + // Unpack the inputs and cast the result back to the desired type.
> + return DAG.getNode(ISD::BITCAST, DL, VT,
> + DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
> + DL, UnpackVT, V1, V2));
> + };
>
> -/// \brief Try to lower as a blend of elements from two inputs followed by
> -/// a single-input permutation.
> -///
> -/// This matches the pattern where we can blend elements from two inputs and
> -/// then reduce the shuffle to a single-input permutation.
> -static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1,
> - SDValue V2,
> - ArrayRef<int> Mask,
> - SelectionDAG &DAG) {
> - // We build up the blend mask while checking whether a blend is a viable way
> - // to reduce the shuffle.
> - SmallVector<int, 32> BlendMask(Mask.size(), -1);
> - SmallVector<int, 32> PermuteMask(Mask.size(), -1);
> + // We try each unpack from the largest to the smallest to try and find one
> + // that fits this mask.
> + int OrigNumElements = VT.getVectorNumElements();
> + int OrigScalarSize = VT.getScalarSizeInBits();
> + for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) {
> + int Scale = ScalarSize / OrigScalarSize;
> + int NumElements = OrigNumElements / Scale;
> + MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), NumElements);
> + if (SDValue Unpack = TryUnpack(UnpackVT, Scale))
> + return Unpack;
> + }
>
> - for (int i = 0, Size = Mask.size(); i < Size; ++i) {
> - if (Mask[i] < 0)
> - continue;
> + // If none of the unpack-rooted lowerings worked (or were profitable) try an
> + // initial unpack.
> + if (NumLoInputs == 0 || NumHiInputs == 0) {
> + assert((NumLoInputs > 0 || NumHiInputs > 0) &&
> + "We have to have *some* inputs!");
> + int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
>
> - assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
> + // FIXME: We could consider the total complexity of the permute of each
> + // possible unpacking. Or at the least we should consider how many
> + // half-crossings are created.
> + // FIXME: We could consider commuting the unpacks.
>
> - if (BlendMask[Mask[i] % Size] == -1)
> - BlendMask[Mask[i] % Size] = Mask[i];
> - else if (BlendMask[Mask[i] % Size] != Mask[i])
> - return SDValue(); // Can't blend in the needed input!
> + SmallVector<int, 32> PermMask;
> + PermMask.assign(Size, -1);
> + for (int i = 0; i < Size; ++i) {
> + if (Mask[i] < 0)
> + continue;
>
> - PermuteMask[i] = Mask[i] % Size;
> + assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
> +
> + PermMask[i] =
> + 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
> + }
> + return DAG.getVectorShuffle(
> + VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
> + DL, VT, V1, V2),
> + DAG.getUNDEF(VT), PermMask);
> }
>
> - SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
> - return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
> + return SDValue();
> }
>
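The PermMask formula in the single-unpack fallback above reads more easily as: after UNPCKL/UNPCKH interleaves the two inputs, an element taken from lane L of the chosen half lands at 2*L, plus one if it came from V2. A short sketch of that index calculation (illustrative helper name only):

    // Where mask entry M lands after unpacking the chosen halves of V1 and V2,
    // mirroring: 2 * ((M % Size) - HalfOffset) + (M < Size ? 0 : 1).
    constexpr int postUnpackIndex(int M, int Size, int HalfOffset) {
      return 2 * ((M % Size) - HalfOffset) + (M < Size ? 0 : 1);
    }
    // v8i16 example with every input in the low halves (HalfOffset == 0):
    // mask entry 10, i.e. V2 lane 2, ends up in lane 5 of the UNPCKL result.
    static_assert(postUnpackIndex(10, 8, 0) == 5, "post-unpack lane");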
> -/// \brief Generic routine to decompose a shuffle and blend into independent
> -/// blends and permutes.
> +/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
> ///
> -/// This matches the extremely common pattern for handling combined
> -/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
> -/// operations. It will try to pick the best arrangement of shuffles and
> -/// blends.
> -static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
> - SDValue V1,
> - SDValue V2,
> - ArrayRef<int> Mask,
> - SelectionDAG &DAG) {
> - // Shuffle the input elements into the desired positions in V1 and V2 and
> - // blend them together.
> - SmallVector<int, 32> V1Mask(Mask.size(), -1);
> - SmallVector<int, 32> V2Mask(Mask.size(), -1);
> - SmallVector<int, 32> BlendMask(Mask.size(), -1);
> - for (int i = 0, Size = Mask.size(); i < Size; ++i)
> - if (Mask[i] >= 0 && Mask[i] < Size) {
> - V1Mask[i] = Mask[i];
> - BlendMask[i] = i;
> - } else if (Mask[i] >= Size) {
> - V2Mask[i] = Mask[i] - Size;
> - BlendMask[i] = i + Size;
> - }
> -
> - // Try to lower with the simpler initial blend strategy unless one of the
> - // input shuffles would be a no-op. We prefer to shuffle inputs as the
> - // shuffle may be able to fold with a load or other benefit. However, when
> - // we'll have to do 2x as many shuffles in order to achieve this, blending
> - // first is a better strategy.
> - if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
> - if (SDValue BlendPerm =
> - lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
> - return BlendPerm;
> +/// This is the basis function for the 2-lane 64-bit shuffles as we have full
> +/// support for floating point shuffles but not integer shuffles. These
> +/// instructions will incur a domain crossing penalty on some chips though so
> +/// it is better to avoid lowering through this for integer vectors where
> +/// possible.
> +static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> + const X86Subtarget *Subtarget,
> + SelectionDAG &DAG) {
> + SDLoc DL(Op);
> + assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
> + assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
> + assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
> + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> + ArrayRef<int> Mask = SVOp->getMask();
> + assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
>
> - V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
> - V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
> - return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
> -}
> + if (isSingleInputShuffleMask(Mask)) {
> + // Use low duplicate instructions for masks that match their pattern.
> + if (Subtarget->hasSSE3())
> + if (isShuffleEquivalent(V1, V2, Mask, 0, 0))
> + return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
>
> -/// \brief Try to lower a vector shuffle as a byte rotation.
> -///
> -/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
> -/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
> -/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
> -/// try to generically lower a vector shuffle through such a pattern. It
> -/// does not check for the profitability of lowering either as PALIGNR or
> -/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
> -/// This matches shuffle vectors that look like:
> -///
> -/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
> -///
> -/// Essentially it concatenates V1 and V2, shifts right by some number of
> -/// elements, and takes the low elements as the result. Note that while this is
> -/// specified as a *right shift* because x86 is little-endian, it is a *left
> -/// rotate* of the vector lanes.
> -static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
> - SDValue V2,
> - ArrayRef<int> Mask,
> - const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> - assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
> + // Straight shuffle of a single input vector. Simulate this by using the
> + // single input as both of the "inputs" to this instruction.
> + unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
>
> - int NumElts = Mask.size();
> - int NumLanes = VT.getSizeInBits() / 128;
> - int NumLaneElts = NumElts / NumLanes;
> + if (Subtarget->hasAVX()) {
> + // If we have AVX, we can use VPERMILPD which will allow folding a load
> + // into the shuffle.
> + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
> + DAG.getConstant(SHUFPDMask, MVT::i8));
> + }
>
> - // We need to detect various ways of spelling a rotation:
> - // [11, 12, 13, 14, 15, 0, 1, 2]
> - // [-1, 12, 13, 14, -1, -1, 1, -1]
> - // [-1, -1, -1, -1, -1, -1, 1, 2]
> - // [ 3, 4, 5, 6, 7, 8, 9, 10]
> - // [-1, 4, 5, 6, -1, -1, 9, -1]
> - // [-1, 4, 5, 6, -1, -1, -1, -1]
> - int Rotation = 0;
> - SDValue Lo, Hi;
> - for (int l = 0; l < NumElts; l += NumLaneElts) {
> - for (int i = 0; i < NumLaneElts; ++i) {
> - if (Mask[l + i] == -1)
> - continue;
> - assert(Mask[l + i] >= 0 && "Only -1 is a valid negative mask element!");
> + return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1,
> + DAG.getConstant(SHUFPDMask, MVT::i8));
> + }
> + assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
> + assert(Mask[1] >= 2 && "Non-canonicalized blend!");
>
> - // Get the mod-Size index and lane correct it.
> - int LaneIdx = (Mask[l + i] % NumElts) - l;
> - // Make sure it was in this lane.
> - if (LaneIdx < 0 || LaneIdx >= NumLaneElts)
> - return SDValue();
> + // If we have a single input, insert that into V1 if we can do so cheaply.
> + if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
> + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
> + MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG))
> + return Insertion;
> + // Try inverting the insertion since for v2 masks it is easy to do and we
> + // can't reliably sort the mask one way or the other.
> + int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
> + Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
> + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
> + MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG))
> + return Insertion;
> + }
>
> - // Determine where a rotated vector would have started.
> - int StartIdx = i - LaneIdx;
> - if (StartIdx == 0)
> - // The identity rotation isn't interesting, stop.
> - return SDValue();
> + // Try to use one of the special instruction patterns to handle two common
> + // blend patterns if a zero-blend above didn't work.
> + if (isShuffleEquivalent(V1, V2, Mask, 0, 3) || isShuffleEquivalent(V1, V2, Mask, 1, 3))
> + if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
> + // We can either use a special instruction to load over the low double or
> + // to move just the low double.
> + return DAG.getNode(
> + isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
> + DL, MVT::v2f64, V2,
> + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
>
> - // If we found the tail of a vector the rotation must be the missing
> - // front. If we found the head of a vector, it must be how much of the
> - // head.
> - int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx;
> + if (Subtarget->hasSSE41())
> + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
> + Subtarget, DAG))
> + return Blend;
>
> - if (Rotation == 0)
> - Rotation = CandidateRotation;
> - else if (Rotation != CandidateRotation)
> - // The rotations don't match, so we can't match this mask.
> - return SDValue();
> + // Use dedicated unpack instructions for masks that match their pattern.
> + if (isShuffleEquivalent(V1, V2, Mask, 0, 2))
> + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
> + if (isShuffleEquivalent(V1, V2, Mask, 1, 3))
> + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
>
> - // Compute which value this mask is pointing at.
> - SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2;
> + unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
> + return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
> + DAG.getConstant(SHUFPDMask, MVT::i8));
> +}
>
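A note on the SHUFPD immediates used in this function: the two-bit immediate selects lane 0 or 1 independently for each operand, so the canonicalized blend at the end packs it as (Mask[0] == 1) | ((Mask[1] - 2 == 1) << 1). A standalone sketch under those canonicalization assumptions (the helper is mine, not part of the patch):

    // Two-operand SHUFPD immediate for a canonicalized v2f64 blend mask, where
    // Mask[0] picks from V1 (0 or 1) and Mask[1] picks from V2 (2 or 3).
    constexpr unsigned shufpdImm(int Mask0, int Mask1) {
      return unsigned(Mask0 == 1) | (unsigned((Mask1 - 2) == 1) << 1);
    }
    // Mask <1, 3> takes the high double of each source: immediate 3.
    static_assert(shufpdImm(1, 3) == 3, "shufpd immediate");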
> - // Compute which of the two target values this index should be assigned
> - // to. This reflects whether the high elements are remaining or the low
> - // elements are remaining.
> - SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
> +/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
> +///
> +/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
> +/// the integer unit to minimize domain crossing penalties. However, for blends
> +/// it falls back to the floating point shuffle operation with appropriate bit
> +/// casting.
> +static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> + const X86Subtarget *Subtarget,
> + SelectionDAG &DAG) {
> + SDLoc DL(Op);
> + assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
> + assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
> + assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
> + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> + ArrayRef<int> Mask = SVOp->getMask();
> + assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
>
> - // Either set up this value if we've not encountered it before, or check
> - // that it remains consistent.
> - if (!TargetV)
> - TargetV = MaskV;
> - else if (TargetV != MaskV)
> - // This may be a rotation, but it pulls from the inputs in some
> - // unsupported interleaving.
> - return SDValue();
> - }
> - }
> + if (isSingleInputShuffleMask(Mask)) {
> + // Check for being able to broadcast a single element.
> + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
> + Mask, Subtarget, DAG))
> + return Broadcast;
>
> - // Check that we successfully analyzed the mask, and normalize the results.
> - assert(Rotation != 0 && "Failed to locate a viable rotation!");
> - assert((Lo || Hi) && "Failed to find a rotated input vector!");
> - if (!Lo)
> - Lo = Hi;
> - else if (!Hi)
> - Hi = Lo;
> -
> - // The actual rotate instruction rotates bytes, so we need to scale the
> - // rotation based on how many bytes are in the vector lane.
> - int Scale = 16 / NumLaneElts;
> -
> - // SSSE3 targets can use the palignr instruction.
> - if (Subtarget->hasSSSE3()) {
> - // Cast the inputs to i8 vector of correct length to match PALIGNR.
> - MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
> - Lo = DAG.getNode(ISD::BITCAST, DL, AlignVT, Lo);
> - Hi = DAG.getNode(ISD::BITCAST, DL, AlignVT, Hi);
> -
> - return DAG.getNode(ISD::BITCAST, DL, VT,
> - DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Hi, Lo,
> - DAG.getConstant(Rotation * Scale, MVT::i8)));
> + // Straight shuffle of a single input vector. For everything from SSE2
> + // onward this has a single fast instruction with no scary immediates.
> + // We have to map the mask as it is actually a v4i32 shuffle instruction.
> + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1);
> + int WidenedMask[4] = {
> + std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
> + std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
> + return DAG.getNode(
> + ISD::BITCAST, DL, MVT::v2i64,
> + DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
> + getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
> }
> + assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
> + assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
> + assert(Mask[0] < 2 && "We sort V1 to be the first input.");
> + assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
>
> - assert(VT.getSizeInBits() == 128 &&
> - "Rotate-based lowering only supports 128-bit lowering!");
> - assert(Mask.size() <= 16 &&
> - "Can shuffle at most 16 bytes in a 128-bit vector!");
> -
> - // Default SSE2 implementation
> - int LoByteShift = 16 - Rotation * Scale;
> - int HiByteShift = Rotation * Scale;
> -
> - // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
> - Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
> - Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
> + // If we have a blend of two PACKUS operations and the blend aligns with the
> + // low and high halves, we can just merge the PACKUS operations. This is
> + // particularly important as it lets us merge shuffles that this routine itself
> + // creates.
> + auto GetPackNode = [](SDValue V) {
> + while (V.getOpcode() == ISD::BITCAST)
> + V = V.getOperand(0);
>
> - SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
> - DAG.getConstant(LoByteShift, MVT::i8));
> - SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
> - DAG.getConstant(HiByteShift, MVT::i8));
> - return DAG.getNode(ISD::BITCAST, DL, VT,
> - DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
> -}
> + return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
> + };
> + if (SDValue V1Pack = GetPackNode(V1))
> + if (SDValue V2Pack = GetPackNode(V2))
> + return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
> + DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8,
> + Mask[0] == 0 ? V1Pack.getOperand(0)
> + : V1Pack.getOperand(1),
> + Mask[1] == 2 ? V2Pack.getOperand(0)
> + : V2Pack.getOperand(1)));
>
> -/// \brief Compute whether each element of a shuffle is zeroable.
> -///
> -/// A "zeroable" vector shuffle element is one which can be lowered to zero.
> -/// Either it is an undef element in the shuffle mask, the element of the input
> -/// referenced is undef, or the element of the input referenced is known to be
> -/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
> -/// as many lanes with this technique as possible to simplify the remaining
> -/// shuffle.
> -static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
> - SDValue V1, SDValue V2) {
> - SmallBitVector Zeroable(Mask.size(), false);
> + // Try to use shift instructions.
> + if (SDValue Shift =
> + lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, DAG))
> + return Shift;
>
> - while (V1.getOpcode() == ISD::BITCAST)
> - V1 = V1->getOperand(0);
> - while (V2.getOpcode() == ISD::BITCAST)
> - V2 = V2->getOperand(0);
> + // When loading a scalar and then shuffling it into a vector we can often do
> + // the insertion cheaply.
> + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
> + MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
> + return Insertion;
> + // Try inverting the insertion since for v2 masks it is easy to do and we
> + // can't reliably sort the mask one way or the other.
> + int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
> + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
> + MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG))
> + return Insertion;
>
> - bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
> - bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
> + // We have different paths for blend lowering, but they all must use the
> + // *exact* same predicate.
> + bool IsBlendSupported = Subtarget->hasSSE41();
> + if (IsBlendSupported)
> + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
> + Subtarget, DAG))
> + return Blend;
>
> - for (int i = 0, Size = Mask.size(); i < Size; ++i) {
> - int M = Mask[i];
> - // Handle the easy cases.
> - if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
> - Zeroable[i] = true;
> - continue;
> - }
> + // Use dedicated unpack instructions for masks that match their pattern.
> + if (isShuffleEquivalent(V1, V2, Mask, 0, 2))
> + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
> + if (isShuffleEquivalent(V1, V2, Mask, 1, 3))
> + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
>
> - // If this is an index into a build_vector node (which has the same number
> - // of elements), dig out the input value and use it.
> - SDValue V = M < Size ? V1 : V2;
> - if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
> - continue;
> + // Try to use byte rotation instructions.
> + // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
> + if (Subtarget->hasSSSE3())
> + if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
> + DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
> + return Rotate;
>
> - SDValue Input = V.getOperand(M % Size);
> - // The UNDEF opcode check really should be dead code here, but not quite
> - // worth asserting on (it isn't invalid, just unexpected).
> - if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
> - Zeroable[i] = true;
> - }
> + // If we have direct support for blends, we should lower by decomposing into
> + // a permute. That will be faster than the domain cross.
> + if (IsBlendSupported)
> + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
> + Mask, DAG);
>
> - return Zeroable;
> + // We implement this with SHUFPD which is pretty lame because it will likely
> + // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
> + // However, all the alternatives are still more cycles and newer chips don't
> + // have this problem. It would be really nice if x86 had better shuffles here.
> + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1);
> + V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2);
> + return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
> + DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
> }
>
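For the single-input v2i64 path above, the 2-element mask is widened into a v4i32 PSHUFD mask by doubling each index, after clamping undef entries to 0 exactly as the std::max calls do. A small sketch of the widening (illustrative only):

    // The two v4i32 lanes that correspond to a single v2i64 mask entry M.
    constexpr int widenedLo(int M) { return (M < 0 ? 0 : M) * 2; }
    constexpr int widenedHi(int M) { return (M < 0 ? 0 : M) * 2 + 1; }
    // The v2i64 swap mask <1, 0> becomes the PSHUFD mask <2, 3, 0, 1>.
    static_assert(widenedLo(1) == 2 && widenedHi(1) == 3 &&
                  widenedLo(0) == 0 && widenedHi(0) == 1, "widened mask");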
> -/// \brief Try to emit a bitmask instruction for a shuffle.
> +/// \brief Test whether this can be lowered with a single SHUFPS instruction.
> ///
> -/// This handles cases where we can model a blend exactly as a bitmask due to
> -/// one of the inputs being zeroable.
> -static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
> - SDValue V2, ArrayRef<int> Mask,
> - SelectionDAG &DAG) {
> - MVT EltVT = VT.getScalarType();
> - int NumEltBits = EltVT.getSizeInBits();
> - MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
> - SDValue Zero = DAG.getConstant(0, IntEltVT);
> - SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), IntEltVT);
> - if (EltVT.isFloatingPoint()) {
> - Zero = DAG.getNode(ISD::BITCAST, DL, EltVT, Zero);
> - AllOnes = DAG.getNode(ISD::BITCAST, DL, EltVT, AllOnes);
> - }
> - SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
> - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
> - SDValue V;
> - for (int i = 0, Size = Mask.size(); i < Size; ++i) {
> - if (Zeroable[i])
> - continue;
> - if (Mask[i] % Size != i)
> - return SDValue(); // Not a blend.
> - if (!V)
> - V = Mask[i] < Size ? V1 : V2;
> - else if (V != (Mask[i] < Size ? V1 : V2))
> - return SDValue(); // Can only let one input through the mask.
> +/// This is used to disable more specialized lowerings when the shufps lowering
> +/// will happen to be efficient.
> +static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
> + // This routine only handles 128-bit shufps.
> + assert(Mask.size() == 4 && "Unsupported mask size!");
>
> - VMaskOps[i] = AllOnes;
> - }
> - if (!V)
> - return SDValue(); // No non-zeroable elements!
> + // To lower with a single SHUFPS we need to have the low half and high half
> + // each requiring a single input.
> + if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4))
> + return false;
> + if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4))
> + return false;
>
> - SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
> - V = DAG.getNode(VT.isFloatingPoint()
> - ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
> - DL, VT, V, VMask);
> - return V;
> + return true;
> }
>
> -/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
> -///
> -/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
> -/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
> -/// matches elements from one of the input vectors shuffled to the left or
> -/// right with zeroable elements 'shifted in'. It handles both the strictly
> -/// bit-wise element shifts and the byte shift across an entire 128-bit double
> -/// quad word lane.
> +/// \brief Lower a vector shuffle using the SHUFPS instruction.
> ///
> -/// PSLL : (little-endian) left bit shift.
> -/// [ zz, 0, zz, 2 ]
> -/// [ -1, 4, zz, -1 ]
> -/// PSRL : (little-endian) right bit shift.
> -/// [ 1, zz, 3, zz]
> -/// [ -1, -1, 7, zz]
> -/// PSLLDQ : (little-endian) left byte shift
> -/// [ zz, 0, 1, 2, 3, 4, 5, 6]
> -/// [ zz, zz, -1, -1, 2, 3, 4, -1]
> -/// [ zz, zz, zz, zz, zz, zz, -1, 1]
> -/// PSRLDQ : (little-endian) right byte shift
> -/// [ 5, 6, 7, zz, zz, zz, zz, zz]
> -/// [ -1, 5, 6, 7, zz, zz, zz, zz]
> -/// [ 1, 2, -1, -1, -1, -1, zz, zz]
> -static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
> - SDValue V2, ArrayRef<int> Mask,
> - SelectionDAG &DAG) {
> - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
> -
> - int Size = Mask.size();
> - assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
> +/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
> +/// It makes no assumptions about whether this is the *best* lowering; it simply
> +/// uses it.
> +static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
> + ArrayRef<int> Mask, SDValue V1,
> + SDValue V2, SelectionDAG &DAG) {
> + SDValue LowV = V1, HighV = V2;
> + int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
>
> - auto CheckZeros = [&](int Shift, int Scale, bool Left) {
> - for (int i = 0; i < Size; i += Scale)
> - for (int j = 0; j < Shift; ++j)
> - if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
> - return false;
> + int NumV2Elements =
> + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
>
> - return true;
> - };
> + if (NumV2Elements == 1) {
> + int V2Index =
> + std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
> + Mask.begin();
>
> - auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) {
> - for (int i = 0; i != Size; i += Scale) {
> - unsigned Pos = Left ? i + Shift : i;
> - unsigned Low = Left ? i : i + Shift;
> - unsigned Len = Scale - Shift;
> - if (!isSequentialOrUndefInRange(Mask, Pos, Len,
> - Low + (V == V1 ? 0 : Size)))
> - return SDValue();
> + // Compute the index adjacent to V2Index and in the same half by toggling
> + // the low bit.
> + int V2AdjIndex = V2Index ^ 1;
> +
> + if (Mask[V2AdjIndex] == -1) {
> + // Handles all the cases where we have a single V2 element and an undef.
> + // This will only ever happen in the high lanes because we commute the
> + // vector otherwise.
> + if (V2Index < 2)
> + std::swap(LowV, HighV);
> + NewMask[V2Index] -= 4;
> + } else {
> + // Handle the case where the V2 element ends up adjacent to a V1 element.
> + // To make this work, blend them together as the first step.
> + int V1Index = V2AdjIndex;
> + int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
> + V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
> + getV4X86ShuffleImm8ForMask(BlendMask, DAG));
> +
> + // Now proceed to reconstruct the final blend as we have the necessary
> + // high or low half formed.
> + if (V2Index < 2) {
> + LowV = V2;
> + HighV = V1;
> + } else {
> + HighV = V2;
> + }
> + NewMask[V1Index] = 2; // We put the V1 element in V2[2].
> + NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
> }
> + } else if (NumV2Elements == 2) {
> + if (Mask[0] < 4 && Mask[1] < 4) {
> + // Handle the easy case where we have V1 in the low lanes and V2 in the
> + // high lanes.
> + NewMask[2] -= 4;
> + NewMask[3] -= 4;
> + } else if (Mask[2] < 4 && Mask[3] < 4) {
> + // We also handle the reversed case because this utility may get called
> + // when we detect a SHUFPS pattern but can't easily commute the shuffle to
> + // arrange things in the right direction.
> + NewMask[0] -= 4;
> + NewMask[1] -= 4;
> + HighV = V1;
> + LowV = V2;
> + } else {
> + // We have a mixture of V1 and V2 in both low and high lanes. Rather than
> + // trying to place elements directly, just blend them and set up the final
> + // shuffle to place them.
>
> - int ShiftEltBits = VT.getScalarSizeInBits() * Scale;
> - bool ByteShift = ShiftEltBits > 64;
> - unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
> - : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
> - int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1);
> + // The first two blend mask elements are for V1, the second two are for
> + // V2.
> + int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
> + Mask[2] < 4 ? Mask[2] : Mask[3],
> + (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
> + (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
> + V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
> + getV4X86ShuffleImm8ForMask(BlendMask, DAG));
>
> - // Normalize the scale for byte shifts to still produce an i64 element
> - // type.
> - Scale = ByteShift ? Scale / 2 : Scale;
> + // Now we do a normal shuffle of V1 by giving V1 as both operands to
> + // a blend.
> + LowV = HighV = V1;
> + NewMask[0] = Mask[0] < 4 ? 0 : 2;
> + NewMask[1] = Mask[0] < 4 ? 2 : 0;
> + NewMask[2] = Mask[2] < 4 ? 1 : 3;
> + NewMask[3] = Mask[2] < 4 ? 3 : 1;
> + }
> + }
> + return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
> + getV4X86ShuffleImm8ForMask(NewMask, DAG));
> +}
>
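The getV4X86ShuffleImm8ForMask calls in this routine produce the usual two-bits-per-lane SHUFPS/PSHUFD immediate, with undef lanes defaulting to their identity index. A standalone sketch of that encoding (hypothetical helper, not part of the patch):

    // Two bits per destination lane, lane 0 in the low bits; undef lanes keep
    // their identity index, matching getV4X86ShuffleImm8ForMask.
    constexpr unsigned v4ShuffleImm8(int M0, int M1, int M2, int M3) {
      return unsigned(M0 < 0 ? 0 : M0) | (unsigned(M1 < 0 ? 1 : M1) << 2) |
             (unsigned(M2 < 0 ? 2 : M2) << 4) | (unsigned(M3 < 0 ? 3 : M3) << 6);
    }
    // The full reversal <3, 2, 1, 0> encodes as 0x1B.
    static_assert(v4ShuffleImm8(3, 2, 1, 0) == 0x1B, "4-lane shuffle imm");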
> - // We need to round trip through the appropriate type for the shift.
> - MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
> - MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
> - assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
> - "Illegal integer vector type");
> - V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
> +/// \brief Lower 4-lane 32-bit floating point shuffles.
> +///
> +/// Uses instructions exclusively from the floating point unit to minimize
> +/// domain crossing penalties, as these are sufficient to implement all v4f32
> +/// shuffles.
> +static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> + const X86Subtarget *Subtarget,
> + SelectionDAG &DAG) {
> + SDLoc DL(Op);
> + assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
> + assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
> + assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
> + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> + ArrayRef<int> Mask = SVOp->getMask();
> + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
>
> - V = DAG.getNode(OpCode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, MVT::i8));
> - return DAG.getNode(ISD::BITCAST, DL, VT, V);
> - };
> + int NumV2Elements =
> + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
>
> - // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
> - // keep doubling the size of the integer elements up to that. We can
> - // then shift the elements of the integer vector by whole multiples of
> - // their width within the elements of the larger integer vector. Test each
> - // multiple to see if we can find a match with the moved element indices
> - // and that the shifted in elements are all zeroable.
> - for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 128; Scale *= 2)
> - for (int Shift = 1; Shift != Scale; ++Shift)
> - for (bool Left : {true, false})
> - if (CheckZeros(Shift, Scale, Left))
> - for (SDValue V : {V1, V2})
> - if (SDValue Match = MatchShift(Shift, Scale, Left, V))
> - return Match;
> + if (NumV2Elements == 0) {
> + // Check for being able to broadcast a single element.
> + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
> + Mask, Subtarget, DAG))
> + return Broadcast;
>
> - // no match
> - return SDValue();
> -}
> + // Use even/odd duplicate instructions for masks that match their pattern.
> + if (Subtarget->hasSSE3()) {
> + if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 2, 2))
> + return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
> + if (isShuffleEquivalent(V1, V2, Mask, 1, 1, 3, 3))
> + return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
> + }
>
> -/// \brief Lower a vector shuffle as a zero or any extension.
> -///
> -/// Given a specific number of elements, element bit width, and extension
> -/// stride, produce either a zero or any extension based on the available
> -/// features of the subtarget.
> -static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
> - SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
> - const X86Subtarget *Subtarget, SelectionDAG &DAG) {
> - assert(Scale > 1 && "Need a scale to extend.");
> - int NumElements = VT.getVectorNumElements();
> - int EltBits = VT.getScalarSizeInBits();
> - assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
> - "Only 8, 16, and 32 bit elements can be extended.");
> - assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
> + if (Subtarget->hasAVX()) {
> + // If we have AVX, we can use VPERMILPS which will allow folding a load
> + // into the shuffle.
> + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
> + getV4X86ShuffleImm8ForMask(Mask, DAG));
> + }
>
> - // Found a valid zext mask! Try various lowering strategies based on the
> - // input type and available ISA extensions.
> - if (Subtarget->hasSSE41()) {
> - MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
> - NumElements / Scale);
> - return DAG.getNode(ISD::BITCAST, DL, VT,
> - DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
> + // Otherwise, use a straight shuffle of a single input vector. We pass the
> + // input vector to both operands to simulate this with a SHUFPS.
> + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
> + getV4X86ShuffleImm8ForMask(Mask, DAG));
> }
>
> - // For any extends we can cheat for larger element sizes and use shuffle
> - // instructions that can fold with a load and/or copy.
> - if (AnyExt && EltBits == 32) {
> - int PSHUFDMask[4] = {0, -1, 1, -1};
> - return DAG.getNode(
> - ISD::BITCAST, DL, VT,
> - DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
> - DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
> - getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
> - }
> - if (AnyExt && EltBits == 16 && Scale > 2) {
> - int PSHUFDMask[4] = {0, -1, 0, -1};
> - InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
> - DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
> - getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG));
> - int PSHUFHWMask[4] = {1, -1, -1, -1};
> - return DAG.getNode(
> - ISD::BITCAST, DL, VT,
> - DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
> - DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV),
> - getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG)));
> - }
> + // There are special ways we can lower some single-element blends. However, we
> + // have custom lowering for more complex single-element blends below, which we
> + // defer to if both this and BLENDPS fail to match. So restrict this path to
> + // the case where the V2 input targets element 0 of the mask -- that is the
> + // fast case here.
> + if (NumV2Elements == 1 && Mask[0] >= 4)
> + if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
> + Mask, Subtarget, DAG))
> + return V;
>
> - // If this would require more than 2 unpack instructions to expand, use
> - // pshufb when available. We can only use more than 2 unpack instructions
> - // when zero extending i8 elements which also makes it easier to use pshufb.
> - if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
> - assert(NumElements == 16 && "Unexpected byte vector width!");
> - SDValue PSHUFBMask[16];
> - for (int i = 0; i < 16; ++i)
> - PSHUFBMask[i] =
> - DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, MVT::i8);
> - InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
> - return DAG.getNode(ISD::BITCAST, DL, VT,
> - DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
> - DAG.getNode(ISD::BUILD_VECTOR, DL,
> - MVT::v16i8, PSHUFBMask)));
> + if (Subtarget->hasSSE41()) {
> + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
> + Subtarget, DAG))
> + return Blend;
> +
> + // Use INSERTPS if we can complete the shuffle efficiently.
> + if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
> + return V;
> +
> + if (!isSingleSHUFPSMask(Mask))
> + if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
> + DL, MVT::v4f32, V1, V2, Mask, DAG))
> + return BlendPerm;
> }
>
> - // Otherwise emit a sequence of unpacks.
> - do {
> - MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
> - SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
> - : getZeroVector(InputVT, Subtarget, DAG, DL);
> - InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
> - InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
> - Scale /= 2;
> - EltBits *= 2;
> - NumElements /= 2;
> - } while (Scale > 1);
> - return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
> + // Use dedicated unpack instructions for masks that match their pattern.
> + if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 1, 5))
> + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
> + if (isShuffleEquivalent(V1, V2, Mask, 2, 6, 3, 7))
> + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
> + if (isShuffleEquivalent(V1, V2, Mask, 4, 0, 5, 1))
> + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V2, V1);
> + if (isShuffleEquivalent(V1, V2, Mask, 6, 2, 7, 3))
> + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V2, V1);
> +
> + // Otherwise fall back to a SHUFPS lowering strategy.
> + return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
> }
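(To make the single-input fast paths in the v4f32 lowering concrete, here is a
standalone sketch that classifies a one-input mask in the same order the code
tries things -- broadcast, then the SSE3 duplicate instructions, then the
generic SHUFPS/VPERMILPS immediate. The function name and the simplification
that only an all-<0> mask counts as a broadcast are mine, not the patch's:

    #include <array>
    #include <string>

    // Classify a single-input v4f32 shuffle mask; -1 means undef and matches
    // anything.  Mirrors the order of the fast paths in the lowering above.
    static std::string classifyV4F32Mask(std::array<int, 4> Mask) {
      auto Matches = [&](std::array<int, 4> Pattern) {
        for (int i = 0; i < 4; ++i)
          if (Mask[i] != -1 && Mask[i] != Pattern[i])
            return false;
        return true;
      };
      if (Matches({0, 0, 0, 0}))
        return "broadcast (VBROADCASTSS with AVX)";
      if (Matches({0, 0, 2, 2}))
        return "MOVSLDUP (SSE3)";
      if (Matches({1, 1, 3, 3}))
        return "MOVSHDUP (SSE3)";
      return "SHUFPS / VPERMILPS with an imm8";
    }

    int main() {
      return classifyV4F32Mask({1, -1, 3, 3}) == "MOVSHDUP (SSE3)" ? 0 : 1;
    }

Two-input masks then fall through to the blend/INSERTPS/unpack attempts and
finally the SHUFPS helper just above.)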
>
> -/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
> +/// \brief Lower 4-lane i32 vector shuffles.
> ///
> -/// This routine will try to do everything in its power to cleverly lower
> -/// a shuffle which happens to match the pattern of a zero extend. It doesn't
> -/// check for the profitability of this lowering, it tries to aggressively
> -/// match this pattern. It will use all of the micro-architectural details it
> -/// can to emit an efficient lowering. It handles both blends with all-zero
> -/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
> -/// masking out later).
> -///
> -/// The reason we have dedicated lowering for zext-style shuffles is that they
> -/// are both incredibly common and often quite performance sensitive.
> -static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
> - SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
> - const X86Subtarget *Subtarget, SelectionDAG &DAG) {
> - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
> -
> - int Bits = VT.getSizeInBits();
> - int NumElements = VT.getVectorNumElements();
> - assert(VT.getScalarSizeInBits() <= 32 &&
> - "Exceeds 32-bit integer zero extension limit");
> - assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
> -
> - // Define a helper function to check a particular ext-scale and lower to it if
> - // valid.
> - auto Lower = [&](int Scale) -> SDValue {
> - SDValue InputV;
> - bool AnyExt = true;
> - for (int i = 0; i < NumElements; ++i) {
> - if (Mask[i] == -1)
> - continue; // Valid anywhere but doesn't tell us anything.
> - if (i % Scale != 0) {
> - // Each of the extended elements need to be zeroable.
> - if (!Zeroable[i])
> - return SDValue();
> +/// We try to handle these with integer-domain shuffles where we can, but for
> +/// blends we use the floating point domain blend instructions.
> +static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> + const X86Subtarget *Subtarget,
> + SelectionDAG &DAG) {
> + SDLoc DL(Op);
> + assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
> + assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
> + assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
> + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> + ArrayRef<int> Mask = SVOp->getMask();
> + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
>
> - // We no longer are in the anyext case.
> - AnyExt = false;
> - continue;
> - }
> + // Whenever we can lower this as a zext, that instruction is strictly faster
> + // than any alternative. It also allows us to fold memory operands into the
> + // shuffle in many cases.
> + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
> + Mask, Subtarget, DAG))
> + return ZExt;
>
> - // Each of the base elements needs to be consecutive indices into the
> - // same input vector.
> - SDValue V = Mask[i] < NumElements ? V1 : V2;
> - if (!InputV)
> - InputV = V;
> - else if (InputV != V)
> - return SDValue(); // Flip-flopping inputs.
> + int NumV2Elements =
> + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
>
> - if (Mask[i] % NumElements != i / Scale)
> - return SDValue(); // Non-consecutive strided elements.
> - }
> + if (NumV2Elements == 0) {
> + // Check for being able to broadcast a single element.
> + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
> + Mask, Subtarget, DAG))
> + return Broadcast;
>
> - // If we fail to find an input, we have a zero-shuffle which should always
> - // have already been handled.
> - // FIXME: Maybe handle this here in case during blending we end up with one?
> - if (!InputV)
> - return SDValue();
> + // Straight shuffle of a single input vector. For everything from SSE2
> + // onward this has a single fast instruction with no scary immediates.
> + // We coerce the shuffle pattern to be compatible with UNPCK instructions
> + // but we aren't actually going to use the UNPCK instruction because doing
> + // so prevents folding a load into this instruction or making a copy.
> + const int UnpackLoMask[] = {0, 0, 1, 1};
> + const int UnpackHiMask[] = {2, 2, 3, 3};
> + if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 1, 1))
> + Mask = UnpackLoMask;
> + else if (isShuffleEquivalent(V1, V2, Mask, 2, 2, 3, 3))
> + Mask = UnpackHiMask;
>
> - return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
> - DL, VT, Scale, AnyExt, InputV, Subtarget, DAG);
> - };
> + return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
> + getV4X86ShuffleImm8ForMask(Mask, DAG));
> + }
>
> - // The widest scale possible for extending is to a 64-bit integer.
> - assert(Bits % 64 == 0 &&
> - "The number of bits in a vector must be divisible by 64 on x86!");
> - int NumExtElements = Bits / 64;
> + // Try to use shift instructions.
> + if (SDValue Shift =
> + lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, DAG))
> + return Shift;
>
> - // Each iteration, try extending the elements half as much, but into twice as
> - // many elements.
> - for (; NumExtElements < NumElements; NumExtElements *= 2) {
> - assert(NumElements % NumExtElements == 0 &&
> - "The input vector size must be divisible by the extended size.");
> - if (SDValue V = Lower(NumElements / NumExtElements))
> + // There are special ways we can lower some single-element blends.
> + if (NumV2Elements == 1)
> + if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
> + Mask, Subtarget, DAG))
> return V;
> - }
>
> - // General extends failed, but 128-bit vectors may be able to use MOVQ.
> - if (Bits != 128)
> - return SDValue();
> + // We have different paths for blend lowering, but they all must use the
> + // *exact* same predicate.
> + bool IsBlendSupported = Subtarget->hasSSE41();
> + if (IsBlendSupported)
> + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
> + Subtarget, DAG))
> + return Blend;
>
> - // Returns one of the source operands if the shuffle can be reduced to a
> - // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
> - auto CanZExtLowHalf = [&]() {
> - for (int i = NumElements / 2; i != NumElements; ++i)
> - if (!Zeroable[i])
> - return SDValue();
> - if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
> - return V1;
> - if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
> - return V2;
> - return SDValue();
> - };
> + if (SDValue Masked =
> + lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
> + return Masked;
>
> - if (SDValue V = CanZExtLowHalf()) {
> - V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V);
> - V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
> - return DAG.getNode(ISD::BITCAST, DL, VT, V);
> - }
> + // Use dedicated unpack instructions for masks that match their pattern.
> + if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 1, 5))
> + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
> + if (isShuffleEquivalent(V1, V2, Mask, 2, 6, 3, 7))
> + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
> + if (isShuffleEquivalent(V1, V2, Mask, 4, 0, 5, 1))
> + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V2, V1);
> + if (isShuffleEquivalent(V1, V2, Mask, 6, 2, 7, 3))
> + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V2, V1);
>
> - // No viable ext lowering found.
> - return SDValue();
> -}
> + // Try to use byte rotation instructions.
> + // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
> + if (Subtarget->hasSSSE3())
> + if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
> + DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
> + return Rotate;
>
> -/// \brief Try to get a scalar value for a specific element of a vector.
> -///
> -/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
> -static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
> - SelectionDAG &DAG) {
> - MVT VT = V.getSimpleValueType();
> - MVT EltVT = VT.getVectorElementType();
> - while (V.getOpcode() == ISD::BITCAST)
> - V = V.getOperand(0);
> - // If the bitcasts shift the element size, we can't extract an equivalent
> - // element from it.
> - MVT NewVT = V.getSimpleValueType();
> - if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
> - return SDValue();
> + // If we have direct support for blends, we should lower by decomposing into
> + // a permute. That will be faster than the domain cross.
> + if (IsBlendSupported)
> + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
> + Mask, DAG);
>
> - if (V.getOpcode() == ISD::BUILD_VECTOR ||
> - (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR))
> - return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, V.getOperand(Idx));
> + // Try to lower by permuting the inputs into an unpack instruction.
> + if (SDValue Unpack =
> + lowerVectorShuffleAsUnpack(MVT::v4i32, DL, V1, V2, Mask, DAG))
> + return Unpack;
>
> - return SDValue();
> + // We implement this with SHUFPS because it can blend from two vectors.
> + // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
> + // up the inputs, bypassing domain shift penalties that we would incur if we
> + // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
> + // relevant.
> + return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32,
> + DAG.getVectorShuffle(
> + MVT::v4f32, DL,
> + DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1),
> + DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask));
> }
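(On the "lower this as a zext" comment at the top of the v4i32 path: the
scale-2 case of that check is simple enough to show standalone. This is my own
restatement and assumes the base lanes read V1; the real routine also handles
the V2 side, larger scales, and the any-extend variant:

    #include <array>

    // A v4i32 mask is a zero-extension of V1's low two lanes if every odd
    // lane is known zero (or undef) and every even lane i reads element i/2
    // of V1.  On SSE4.1 that becomes a single PMOVZXDQ.
    static bool isV1LowZExtByTwo(std::array<int, 4> Mask,
                                 std::array<bool, 4> Zeroable) {
      for (int i = 0; i < 4; ++i) {
        if (Mask[i] == -1)
          continue;                    // undef is acceptable anywhere
        if (i % 2 != 0) {
          if (!Zeroable[i])
            return false;              // extended lanes must be zeroable
        } else if (Mask[i] != i / 2) {
          return false;                // base lanes must be consecutive V1 elements
        }
      }
      return true;
    }

    int main() {
      // <0, z, 1, z> where the z lanes read known-zero elements: the
      // PMOVZXDQ pattern.
      std::array<int, 4> Mask = {0, 5, 1, 7};
      std::array<bool, 4> Zeroable = {false, true, false, true};
      return isV1LowZExtByTwo(Mask, Zeroable) ? 0 : 1;
    })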
>
> -/// \brief Helper to test for a load that can be folded with x86 shuffles.
> +/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
> +/// shuffle lowering, and the most complex part.
> ///
> -/// This is particularly important because the set of instructions varies
> -/// significantly based on whether the operand is a load or not.
> -static bool isShuffleFoldableLoad(SDValue V) {
> - while (V.getOpcode() == ISD::BITCAST)
> - V = V.getOperand(0);
> -
> - return ISD::isNON_EXTLoad(V.getNode());
> -}
> -
> -/// \brief Try to lower insertion of a single element into a zero vector.
> +/// The lowering strategy is to try to form pairs of input lanes which are
> +/// targeted at the same half of the final vector, and then use a dword shuffle
> +/// to place them onto the right half, and finally unpack the paired lanes into
> +/// their final position.
> ///
> -/// This is a common pattern that we have especially efficient patterns to lower
> -/// across all subtarget feature sets.
> -static SDValue lowerVectorShuffleAsElementInsertion(
> - MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
> +/// The exact breakdown of how to form these dword pairs and align them on the
> +/// correct sides is really tricky. See the comments within the function for
> +/// more of the details.
> +static SDValue lowerV8I16SingleInputVectorShuffle(
> + SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
> const X86Subtarget *Subtarget, SelectionDAG &DAG) {
> - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
> - MVT ExtVT = VT;
> - MVT EltVT = VT.getVectorElementType();
> + assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
> + MutableArrayRef<int> LoMask = Mask.slice(0, 4);
> + MutableArrayRef<int> HiMask = Mask.slice(4, 4);
>
> - int V2Index = std::find_if(Mask.begin(), Mask.end(),
> - [&Mask](int M) { return M >= (int)Mask.size(); }) -
> - Mask.begin();
> - bool IsV1Zeroable = true;
> - for (int i = 0, Size = Mask.size(); i < Size; ++i)
> - if (i != V2Index && !Zeroable[i]) {
> - IsV1Zeroable = false;
> - break;
> - }
> + SmallVector<int, 4> LoInputs;
> + std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
> + [](int M) { return M >= 0; });
> + std::sort(LoInputs.begin(), LoInputs.end());
> + LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
> + SmallVector<int, 4> HiInputs;
> + std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
> + [](int M) { return M >= 0; });
> + std::sort(HiInputs.begin(), HiInputs.end());
> + HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
> + int NumLToL =
> + std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
> + int NumHToL = LoInputs.size() - NumLToL;
> + int NumLToH =
> + std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
> + int NumHToH = HiInputs.size() - NumLToH;
> + MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
> + MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
> + MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
> + MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
>
> - // Check for a single input from a SCALAR_TO_VECTOR node.
> - // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
> - // all the smarts here sunk into that routine. However, the current
> - // lowering of BUILD_VECTOR makes that nearly impossible until the old
> - // vector shuffle lowering is dead.
> - if (SDValue V2S = getScalarValueForVectorElement(
> - V2, Mask[V2Index] - Mask.size(), DAG)) {
> - // We need to zext the scalar if it is smaller than an i32.
> - V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
> - if (EltVT == MVT::i8 || EltVT == MVT::i16) {
> - // Using zext to expand a narrow element won't work for non-zero
> - // insertions.
> - if (!IsV1Zeroable)
> - return SDValue();
> -
> - // Zero-extend directly to i32.
> - ExtVT = MVT::v4i32;
> - V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
> - }
> - V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
> - } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
> - EltVT == MVT::i16) {
> - // Either not inserting from the low element of the input or the input
> - // element size is too small to use VZEXT_MOVL to clear the high bits.
> - return SDValue();
> - }
> -
> - if (!IsV1Zeroable) {
> - // If V1 can't be treated as a zero vector we have fewer options to lower
> - // this. We can't support integer vectors or non-zero targets cheaply, and
> - // the V1 elements can't be permuted in any way.
> - assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
> - if (!VT.isFloatingPoint() || V2Index != 0)
> - return SDValue();
> - SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
> - V1Mask[V2Index] = -1;
> - if (!isNoopShuffleMask(V1Mask))
> - return SDValue();
> - // This is essentially a special case blend operation, but if we have
> - // general purpose blend operations, they are always faster. Bail and let
> - // the rest of the lowering handle these as blends.
> - if (Subtarget->hasSSE41())
> - return SDValue();
> + // Check for being able to broadcast a single element.
> + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
> + Mask, Subtarget, DAG))
> + return Broadcast;
>
> - // Otherwise, use MOVSD or MOVSS.
> - assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
> - "Only two types of floating point element types to handle!");
> - return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
> - ExtVT, V1, V2);
> - }
> + // Try to use shift instructions.
> + if (SDValue Shift =
> + lowerVectorShuffleAsShift(DL, MVT::v8i16, V, V, Mask, DAG))
> + return Shift;
>
> - // This lowering only works for the low element with floating point vectors.
> - if (VT.isFloatingPoint() && V2Index != 0)
> - return SDValue();
> + // Use dedicated unpack instructions for masks that match their pattern.
> + if (isShuffleEquivalent(V, V, Mask, 0, 0, 1, 1, 2, 2, 3, 3))
> + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
> + if (isShuffleEquivalent(V, V, Mask, 4, 4, 5, 5, 6, 6, 7, 7))
> + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
>
> - V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
> - if (ExtVT != VT)
> - V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
> + // Try to use byte rotation instructions.
> + if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
> + DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
> + return Rotate;
>
> - if (V2Index != 0) {
> - // If we have 4 or fewer lanes we can cheaply shuffle the element into
> - // the desired position. Otherwise it is more efficient to do a vector
> - // shift left. We know that we can do a vector shift left because all
> - // the inputs are zero.
> - if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
> - SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
> - V2Shuffle[V2Index] = 0;
> - V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
> - } else {
> - V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2);
> - V2 = DAG.getNode(
> - X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
> - DAG.getConstant(
> - V2Index * EltVT.getSizeInBits()/8,
> - DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
> - V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
> - }
> - }
> - return V2;
> -}
> + // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
> + // such inputs we can swap two of the dwords across the half mark and end up
> + // with <=2 inputs to each half in each half. Once there, we can fall through
> + // to the generic code below. For example:
> + //
> + // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
> + // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
> + //
> + // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
> + // and an existing 2-into-2 on the other half. In this case we may have to
> + // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
> + // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
> + // Fortunately, we don't have to handle anything but a 2-into-2 pattern
> + // because any other situation (including a 3-into-1 or 1-into-3 in the other
> + // half than the one we target for fixing) will be fixed when we re-enter this
> + // path. Any sequence of PSHUFD instructions this produces will also be
> + // combined into a single instruction. Here is an example of the tricky case:
> + //
> + // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
> + // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
> + //
> + // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
> + //
> + // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
> + // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
> + //
> + // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
> + // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
> + //
> + // The result is fine to be handled by the generic logic.
> + auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
> + ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
> + int AOffset, int BOffset) {
> + assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
> + "Must call this with A having 3 or 1 inputs from the A half.");
> + assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
> + "Must call this with B having 1 or 3 inputs from the B half.");
> + assert(AToAInputs.size() + BToAInputs.size() == 4 &&
> + "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
>
> -/// \brief Try to lower broadcast of a single element.
> -///
> -/// For convenience, this code also bundles all of the subtarget feature set
> -/// filtering. While a little annoying to re-dispatch on type here, there isn't
> -/// a convenient way to factor it out.
> -static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
> - ArrayRef<int> Mask,
> - const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> - if (!Subtarget->hasAVX())
> - return SDValue();
> - if (VT.isInteger() && !Subtarget->hasAVX2())
> - return SDValue();
> + // Compute the index of dword with only one word among the three inputs in
> + // a half by taking the sum of the half with three inputs and subtracting
> + // the sum of the actual three inputs. The difference is the remaining
> + // slot.
> + int ADWord, BDWord;
> + int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord;
> + int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord;
> + int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset;
> + ArrayRef<int> TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs;
> + int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0];
> + int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
> + int TripleNonInputIdx =
> + TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
> + TripleDWord = TripleNonInputIdx / 2;
>
> - // Check that the mask is a broadcast.
> - int BroadcastIdx = -1;
> - for (int M : Mask)
> - if (M >= 0 && BroadcastIdx == -1)
> - BroadcastIdx = M;
> - else if (M >= 0 && M != BroadcastIdx)
> - return SDValue();
> + // We use xor with one to compute the adjacent DWord to whichever one the
> + // OneInput is in.
> + OneInputDWord = (OneInput / 2) ^ 1;
>
> - assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
> - "a sorted mask where the broadcast "
> - "comes from V1.");
> + // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
> + // and BToA inputs. If there is also such a problem with the BToB and AToB
> + // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
> + // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
> + // is essential that we don't *create* a 3<-1 as then we might oscillate.
> + if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
> + // Compute how many inputs will be flipped by swapping these DWords. We
> + // need to balance this to ensure we don't form a 3-1 shuffle in the
> + // other half.
> + int NumFlippedAToBInputs =
> + std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
> + std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
> + int NumFlippedBToBInputs =
> + std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
> + std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
> + if ((NumFlippedAToBInputs == 1 &&
> + (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
> + (NumFlippedBToBInputs == 1 &&
> + (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
> + // We choose whether to fix the A half or B half based on whether that
> + // half has zero flipped inputs. At zero, we may not be able to fix it
> + // with that half. We also bias towards fixing the B half because that
> + // will more commonly be the high half, and we have to bias one way.
> + auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
> + ArrayRef<int> Inputs) {
> + int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
> + bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
> + PinnedIdx ^ 1) != Inputs.end();
> + // Determine whether the free index is in the flipped dword or the
> + // unflipped dword based on where the pinned index is. We use this bit
> + // in an xor to conditionally select the adjacent dword.
> + int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
> + bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
> + FixFreeIdx) != Inputs.end();
> + if (IsFixIdxInput == IsFixFreeIdxInput)
> + FixFreeIdx += 1;
> + IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
> + FixFreeIdx) != Inputs.end();
> + assert(IsFixIdxInput != IsFixFreeIdxInput &&
> + "We need to be changing the number of flipped inputs!");
> + int PSHUFHalfMask[] = {0, 1, 2, 3};
> + std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
> + V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
> + MVT::v8i16, V,
> + getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DAG));
>
> - // Go up the chain of (vector) values to try and find a scalar load that
> - // we can combine with the broadcast.
> - for (;;) {
> - switch (V.getOpcode()) {
> - case ISD::CONCAT_VECTORS: {
> - int OperandSize = Mask.size() / V.getNumOperands();
> - V = V.getOperand(BroadcastIdx / OperandSize);
> - BroadcastIdx %= OperandSize;
> - continue;
> + for (int &M : Mask)
> + if (M != -1 && M == FixIdx)
> + M = FixFreeIdx;
> + else if (M != -1 && M == FixFreeIdx)
> + M = FixIdx;
> + };
> + if (NumFlippedBToBInputs != 0) {
> + int BPinnedIdx =
> + BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
> + FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
> + } else {
> + assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
> + int APinnedIdx =
> + AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
> + FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
> + }
> + }
> }
>
> - case ISD::INSERT_SUBVECTOR: {
> - SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
> - auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
> - if (!ConstantIdx)
> - break;
> + int PSHUFDMask[] = {0, 1, 2, 3};
> + PSHUFDMask[ADWord] = BDWord;
> + PSHUFDMask[BDWord] = ADWord;
> + V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
> + DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
> + DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
> + getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
>
> - int BeginIdx = (int)ConstantIdx->getZExtValue();
> - int EndIdx =
> - BeginIdx + (int)VInner.getValueType().getVectorNumElements();
> - if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
> - BroadcastIdx -= BeginIdx;
> - V = VInner;
> - } else {
> - V = VOuter;
> - }
> - continue;
> - }
> - }
> - break;
> - }
> -
> - // Check if this is a broadcast of a scalar. We special case lowering
> - // for scalars so that we can more effectively fold with loads.
> - if (V.getOpcode() == ISD::BUILD_VECTOR ||
> - (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
> - V = V.getOperand(BroadcastIdx);
> -
> - // If the scalar isn't a load we can't broadcast from it in AVX1, only with
> - // AVX2.
> - if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
> - return SDValue();
> - } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
> - // We can't broadcast from a vector register w/o AVX2, and we can only
> - // broadcast from the zero-element of a vector register.
> - return SDValue();
> - }
> -
> - return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
> -}
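(The broadcast check in the hunk above boils down to "every defined lane names
the same source element". Restated standalone, with a made-up name, since the
control flow of the real loop is easy to misread in diff form; the subtarget
and load-folding restrictions are of course not modeled here:

    #include <array>

    // Returns the broadcast element index, or -1 if the mask is not a
    // broadcast.  Undef lanes (-1) are ignored, matching the lowering above.
    static int getBroadcastIdx(std::array<int, 4> Mask) {
      int Idx = -1;
      for (int M : Mask) {
        if (M < 0)
          continue;
        if (Idx == -1)
          Idx = M;
        else if (M != Idx)
          return -1;   // two different source elements: not a broadcast
      }
      return Idx;
    }

    int main() { return getBroadcastIdx({2, -1, 2, 2}) == 2 ? 0 : 1; })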
> -
> -// Check for whether we can use INSERTPS to perform the shuffle. We only use
> -// INSERTPS when the V1 elements are already in the correct locations
> -// because otherwise we can just always use two SHUFPS instructions which
> -// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
> -// perform INSERTPS if a single V1 element is out of place and all V2
> -// elements are zeroable.
> -static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
> - ArrayRef<int> Mask,
> - SelectionDAG &DAG) {
> - assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
> - assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
> - assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
> - assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
> + // Adjust the mask to match the new locations of A and B.
> + for (int &M : Mask)
> + if (M != -1 && M/2 == ADWord)
> + M = 2 * BDWord + M % 2;
> + else if (M != -1 && M/2 == BDWord)
> + M = 2 * ADWord + M % 2;
>
> - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
> + // Recurse back into this routine to re-compute state now that this isn't
> + // a 3 and 1 problem.
> + return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
> + Mask);
> + };
> + if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
> + return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
> + else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
> + return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
>
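(The "sum of the half minus the sum of the three inputs" trick in balanceSides
is neat but easy to misread in a diff, so here is the arithmetic on made-up
numbers -- three inputs {4, 6, 7} coming from the high half:

    #include <numeric>
    #include <vector>

    int main() {
      // Three of the high half's four words (indices 4..7) are used as
      // inputs; the unused word index is the full sum of that half minus the
      // sum of the inputs, and its dword is just that index divided by two.
      const int Offset = 4;                          // high half
      const std::vector<int> TripleInputs = {4, 6, 7};
      const int TripleInputSum = 0 + 1 + 2 + 3 + 4 * Offset;            // 22
      const int TripleNonInputIdx =
          TripleInputSum -
          std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0); // 5
      const int TripleDWord = TripleNonInputIdx / 2;                    // 2
      return (TripleNonInputIdx == 5 && TripleDWord == 2) ? 0 : 1;
    })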
> - unsigned ZMask = 0;
> - int V1DstIndex = -1;
> - int V2DstIndex = -1;
> - bool V1UsedInPlace = false;
> + // At this point there are at most two inputs to the low and high halves from
> + // each half. That means the inputs can always be grouped into dwords and
> + // those dwords can then be moved to the correct half with a dword shuffle.
> + // We use at most one low and one high word shuffle to collect these paired
> + // inputs into dwords, and finally a dword shuffle to place them.
> + int PSHUFLMask[4] = {-1, -1, -1, -1};
> + int PSHUFHMask[4] = {-1, -1, -1, -1};
> + int PSHUFDMask[4] = {-1, -1, -1, -1};
>
> - for (int i = 0; i < 4; ++i) {
> - // Synthesize a zero mask from the zeroable elements (includes undefs).
> - if (Zeroable[i]) {
> - ZMask |= 1 << i;
> - continue;
> + // First fix the masks for all the inputs that are staying in their
> + // original halves. This will then dictate the targets of the cross-half
> + // shuffles.
> + auto fixInPlaceInputs =
> + [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
> + MutableArrayRef<int> SourceHalfMask,
> + MutableArrayRef<int> HalfMask, int HalfOffset) {
> + if (InPlaceInputs.empty())
> + return;
> + if (InPlaceInputs.size() == 1) {
> + SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
> + InPlaceInputs[0] - HalfOffset;
> + PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
> + return;
> }
> -
> - // Flag if we use any V1 inputs in place.
> - if (i == Mask[i]) {
> - V1UsedInPlace = true;
> - continue;
> + if (IncomingInputs.empty()) {
> + // Just fix all of the in place inputs.
> + for (int Input : InPlaceInputs) {
> + SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
> + PSHUFDMask[Input / 2] = Input / 2;
> + }
> + return;
> }
>
> - // We can only insert a single non-zeroable element.
> - if (V1DstIndex != -1 || V2DstIndex != -1)
> - return SDValue();
> + assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
> + SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
> + InPlaceInputs[0] - HalfOffset;
> + // Put the second input next to the first so that they are packed into
> + // a dword. We find the adjacent index by toggling the low bit.
> + int AdjIndex = InPlaceInputs[0] ^ 1;
> + SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
> + std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
> + PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
> + };
> + fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
> + fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
>
> - if (Mask[i] < 4) {
> - // V1 input out of place for insertion.
> - V1DstIndex = i;
> - } else {
> - // V2 input for insertion.
> - V2DstIndex = i;
> - }
> - }
> + // Now gather the cross-half inputs and place them into a free dword of
> + // their target half.
> + // FIXME: This operation could almost certainly be simplified dramatically to
> + // look more like the 3-1 fixing operation.
> + auto moveInputsToRightHalf = [&PSHUFDMask](
> + MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
> + MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
> + MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
> + int DestOffset) {
> + auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
> + return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
> + };
> + auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
> + int Word) {
> + int LowWord = Word & ~1;
> + int HighWord = Word | 1;
> + return isWordClobbered(SourceHalfMask, LowWord) ||
> + isWordClobbered(SourceHalfMask, HighWord);
> + };
>
> - // Don't bother if we have no (non-zeroable) element for insertion.
> - if (V1DstIndex == -1 && V2DstIndex == -1)
> - return SDValue();
> + if (IncomingInputs.empty())
> + return;
>
> - // Determine element insertion src/dst indices. The src index is from the
> - // start of the inserted vector, not the start of the concatenated vector.
> - unsigned V2SrcIndex = 0;
> - if (V1DstIndex != -1) {
> - // If we have a V1 input out of place, we use V1 as the V2 element insertion
> - // and don't use the original V2 at all.
> - V2SrcIndex = Mask[V1DstIndex];
> - V2DstIndex = V1DstIndex;
> - V2 = V1;
> - } else {
> - V2SrcIndex = Mask[V2DstIndex] - 4;
> - }
> + if (ExistingInputs.empty()) {
> + // Map any dwords with inputs from them into the right half.
> + for (int Input : IncomingInputs) {
> + // If the source half mask maps over the inputs, turn those into
> + // swaps and use the swapped lane.
> + if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
> + if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) {
> + SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
> + Input - SourceOffset;
> + // We have to swap the uses in our half mask in one sweep.
> + for (int &M : HalfMask)
> + if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
> + M = Input;
> + else if (M == Input)
> + M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
> + } else {
> + assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
> + Input - SourceOffset &&
> + "Previous placement doesn't match!");
> + }
> + // Note that this correctly re-maps both when we do a swap and when
> + // we observe the other side of the swap above. We rely on that to
> + // avoid swapping the members of the input list directly.
> + Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
> + }
>
> - // If no V1 inputs are used in place, then the result is created only from
> - // the zero mask and the V2 insertion - so remove V1 dependency.
> - if (!V1UsedInPlace)
> - V1 = DAG.getUNDEF(MVT::v4f32);
> + // Map the input's dword into the correct half.
> + if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1)
> + PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
> + else
> + assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
> + Input / 2 &&
> + "Previous placement doesn't match!");
> + }
>
> - unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
> - assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
> + // And just directly shift any other-half mask elements to be same-half
> + // as we will have mirrored the dword containing the element into the
> + // same position within that half.
> + for (int &M : HalfMask)
> + if (M >= SourceOffset && M < SourceOffset + 4) {
> + M = M - SourceOffset + DestOffset;
> + assert(M >= 0 && "This should never wrap below zero!");
> + }
> + return;
> + }
>
> - // Insert the V2 element into the desired position.
> - SDLoc DL(Op);
> - return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
> - DAG.getConstant(InsertPSMask, MVT::i8));
> -}
> + // Ensure we have the input in a viable dword of its current half. This
> + // is particularly tricky because the original position may be clobbered
> + // by inputs being moved and *staying* in that half.
> + if (IncomingInputs.size() == 1) {
> + if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
> + int InputFixed = std::find(std::begin(SourceHalfMask),
> + std::end(SourceHalfMask), -1) -
> + std::begin(SourceHalfMask) + SourceOffset;
> + SourceHalfMask[InputFixed - SourceOffset] =
> + IncomingInputs[0] - SourceOffset;
> + std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
> + InputFixed);
> + IncomingInputs[0] = InputFixed;
> + }
> + } else if (IncomingInputs.size() == 2) {
> + if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
> + isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
> + // We have two non-adjacent or clobbered inputs we need to extract from
> + // the source half. To do this, we need to map them into some adjacent
> + // dword slot in the source mask.
> + int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
> + IncomingInputs[1] - SourceOffset};
>
> -/// \brief Try to lower a shuffle as a permute of the inputs followed by an
> -/// UNPCK instruction.
> -///
> -/// This specifically targets cases where we end up with alternating between
> -/// the two inputs, and so can permute them into something that feeds a single
> -/// UNPCK instruction. Note that this routine only targets integer vectors
> -/// because for floating point vectors we have a generalized SHUFPS lowering
> -/// strategy that handles everything that doesn't *exactly* match an unpack,
> -/// making this clever lowering unnecessary.
> -static SDValue lowerVectorShuffleAsUnpack(MVT VT, SDLoc DL, SDValue V1,
> - SDValue V2, ArrayRef<int> Mask,
> - SelectionDAG &DAG) {
> - assert(!VT.isFloatingPoint() &&
> - "This routine only supports integer vectors.");
> - assert(!isSingleInputShuffleMask(Mask) &&
> - "This routine should only be used when blending two inputs.");
> - assert(Mask.size() >= 2 && "Single element masks are invalid.");
> -
> - int Size = Mask.size();
> -
> - int NumLoInputs = std::count_if(Mask.begin(), Mask.end(), [Size](int M) {
> - return M >= 0 && M % Size < Size / 2;
> - });
> - int NumHiInputs = std::count_if(
> - Mask.begin(), Mask.end(), [Size](int M) { return M % Size >= Size / 2; });
> -
> - bool UnpackLo = NumLoInputs >= NumHiInputs;
> -
> - auto TryUnpack = [&](MVT UnpackVT, int Scale) {
> - SmallVector<int, 32> V1Mask(Mask.size(), -1);
> - SmallVector<int, 32> V2Mask(Mask.size(), -1);
> -
> - for (int i = 0; i < Size; ++i) {
> - if (Mask[i] < 0)
> - continue;
> -
> - // Each element of the unpack contains Scale elements from this mask.
> - int UnpackIdx = i / Scale;
> -
> - // We only handle the case where V1 feeds the first slots of the unpack.
> - // We rely on canonicalization to ensure this is the case.
> - if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
> - return SDValue();
> -
> - // Setup the mask for this input. The indexing is tricky as we have to
> - // handle the unpack stride.
> - SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
> - VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
> - Mask[i] % Size;
> - }
> -
> - // If we will have to shuffle both inputs to use the unpack, check whether
> - // we can just unpack first and shuffle the result. If so, skip this unpack.
> - if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
> - !isNoopShuffleMask(V2Mask))
> - return SDValue();
> -
> - // Shuffle the inputs into place.
> - V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
> - V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
> -
> - // Cast the inputs to the type we will use to unpack them.
> - V1 = DAG.getNode(ISD::BITCAST, DL, UnpackVT, V1);
> - V2 = DAG.getNode(ISD::BITCAST, DL, UnpackVT, V2);
> -
> - // Unpack the inputs and cast the result back to the desired type.
> - return DAG.getNode(ISD::BITCAST, DL, VT,
> - DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
> - DL, UnpackVT, V1, V2));
> - };
> -
> - // We try each unpack from the largest to the smallest to try and find one
> - // that fits this mask.
> - int OrigNumElements = VT.getVectorNumElements();
> - int OrigScalarSize = VT.getScalarSizeInBits();
> - for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) {
> - int Scale = ScalarSize / OrigScalarSize;
> - int NumElements = OrigNumElements / Scale;
> - MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), NumElements);
> - if (SDValue Unpack = TryUnpack(UnpackVT, Scale))
> - return Unpack;
> - }
> -
> - // If none of the unpack-rooted lowerings worked (or were profitable) try an
> - // initial unpack.
> - if (NumLoInputs == 0 || NumHiInputs == 0) {
> - assert((NumLoInputs > 0 || NumHiInputs > 0) &&
> - "We have to have *some* inputs!");
> - int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
> -
> - // FIXME: We could consider the total complexity of the permute of each
> - // possible unpacking. Or at the least we should consider how many
> - // half-crossings are created.
> - // FIXME: We could consider commuting the unpacks.
> -
> - SmallVector<int, 32> PermMask;
> - PermMask.assign(Size, -1);
> - for (int i = 0; i < Size; ++i) {
> - if (Mask[i] < 0)
> - continue;
> -
> - assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
> -
> - PermMask[i] =
> - 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
> - }
> - return DAG.getVectorShuffle(
> - VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
> - DL, VT, V1, V2),
> - DAG.getUNDEF(VT), PermMask);
> - }
> -
> - return SDValue();
> -}
> -
> -/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
> -///
> -/// This is the basis function for the 2-lane 64-bit shuffles as we have full
> -/// support for floating point shuffles but not integer shuffles. These
> -/// instructions will incur a domain crossing penalty on some chips though so
> -/// it is better to avoid lowering through this for integer vectors where
> -/// possible.
> -static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> - const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> - SDLoc DL(Op);
> - assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
> - assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
> - assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
> - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> - ArrayRef<int> Mask = SVOp->getMask();
> - assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
> -
> - if (isSingleInputShuffleMask(Mask)) {
> - // Use low duplicate instructions for masks that match their pattern.
> - if (Subtarget->hasSSE3())
> - if (isShuffleEquivalent(V1, V2, Mask, 0, 0))
> - return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
> -
> - // Straight shuffle of a single input vector. Simulate this by using the
> - // single input as both of the "inputs" to this instruction.
> - unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
> -
> - if (Subtarget->hasAVX()) {
> - // If we have AVX, we can use VPERMILPS which will allow folding a load
> - // into the shuffle.
> - return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
> - DAG.getConstant(SHUFPDMask, MVT::i8));
> - }
> -
> - return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1,
> - DAG.getConstant(SHUFPDMask, MVT::i8));
> - }
> - assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
> - assert(Mask[1] >= 2 && "Non-canonicalized blend!");
> -
> - // If we have a single input, insert that into V1 if we can do so cheaply.
> - if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
> - if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
> - MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG))
> - return Insertion;
> - // Try inverting the insertion since for v2 masks it is easy to do and we
> - // can't reliably sort the mask one way or the other.
> - int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
> - Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
> - if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
> - MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG))
> - return Insertion;
> - }
> -
> - // Try to use one of the special instruction patterns to handle two common
> - // blend patterns if a zero-blend above didn't work.
> - if (isShuffleEquivalent(V1, V2, Mask, 0, 3) || isShuffleEquivalent(V1, V2, Mask, 1, 3))
> - if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
> - // We can either use a special instruction to load over the low double or
> - // to move just the low double.
> - return DAG.getNode(
> - isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
> - DL, MVT::v2f64, V2,
> - DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
> -
> - if (Subtarget->hasSSE41())
> - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
> - Subtarget, DAG))
> - return Blend;
> -
> - // Use dedicated unpack instructions for masks that match their pattern.
> - if (isShuffleEquivalent(V1, V2, Mask, 0, 2))
> - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
> - if (isShuffleEquivalent(V1, V2, Mask, 1, 3))
> - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
> -
> - unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
> - return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
> - DAG.getConstant(SHUFPDMask, MVT::i8));
> -}
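(Same kind of aside for the v2f64 hunk: the SHUFPD immediate is only two bits --
bit 0 selects which element of the first operand lands in result lane 0, and
bit 1 selects the element of the second operand for lane 1. Below is a
standalone mirror of the blend-form computation in the code above; it assumes
Mask[1] already refers to V2 (i.e. is 2 or 3):

    #include <cstdint>

    // imm8 for SHUFPD xmm1, xmm2: result[0] = xmm1[imm & 1],
    // result[1] = xmm2[(imm >> 1) & 1].
    static uint8_t shufpdImm(int Mask0, int Mask1) {
      return static_cast<uint8_t>((Mask0 == 1) | (((Mask1 - 2) == 1) << 1));
    }

    int main() {
      // <0, 3> -> 0b10 and <1, 2> -> 0b01.
      return (shufpdImm(0, 3) == 0x2 && shufpdImm(1, 2) == 0x1) ? 0 : 1;
    })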
> -
> -/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
> -///
> -/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
> -/// the integer unit to minimize domain crossing penalties. However, for blends
> -/// it falls back to the floating point shuffle operation with appropriate bit
> -/// casting.
> -static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> - const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> - SDLoc DL(Op);
> - assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
> - assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
> - assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
> - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> - ArrayRef<int> Mask = SVOp->getMask();
> - assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
> -
> - if (isSingleInputShuffleMask(Mask)) {
> - // Check for being able to broadcast a single element.
> - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
> - Mask, Subtarget, DAG))
> - return Broadcast;
> -
> - // Straight shuffle of a single input vector. For everything from SSE2
> - // onward this has a single fast instruction with no scary immediates.
> - // We have to map the mask as it is actually a v4i32 shuffle instruction.
> - V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1);
> - int WidenedMask[4] = {
> - std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
> - std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
> - return DAG.getNode(
> - ISD::BITCAST, DL, MVT::v2i64,
> - DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
> - getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
> - }
> - assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
> - assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
> - assert(Mask[0] < 2 && "We sort V1 to be the first input.");
> - assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
> -
> - // If we have a blend of two PACKUS operations and the blend aligns with the
> - // low and high halves, we can just merge the PACKUS operations. This is
> - // particularly important as it lets us merge shuffles that this routine itself
> - // creates.
> - auto GetPackNode = [](SDValue V) {
> - while (V.getOpcode() == ISD::BITCAST)
> - V = V.getOperand(0);
> -
> - return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
> - };
> - if (SDValue V1Pack = GetPackNode(V1))
> - if (SDValue V2Pack = GetPackNode(V2))
> - return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
> - DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8,
> - Mask[0] == 0 ? V1Pack.getOperand(0)
> - : V1Pack.getOperand(1),
> - Mask[1] == 2 ? V2Pack.getOperand(0)
> - : V2Pack.getOperand(1)));
> -
> - // Try to use shift instructions.
> - if (SDValue Shift =
> - lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, DAG))
> - return Shift;
> -
> - // When loading a scalar and then shuffling it into a vector we can often do
> - // the insertion cheaply.
> - if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
> - MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
> - return Insertion;
> - // Try inverting the insertion since for v2 masks it is easy to do and we
> - // can't reliably sort the mask one way or the other.
> - int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
> - if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
> - MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG))
> - return Insertion;
> -
> - // We have different paths for blend lowering, but they all must use the
> - // *exact* same predicate.
> - bool IsBlendSupported = Subtarget->hasSSE41();
> - if (IsBlendSupported)
> - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
> - Subtarget, DAG))
> - return Blend;
> -
> - // Use dedicated unpack instructions for masks that match their pattern.
> - if (isShuffleEquivalent(V1, V2, Mask, 0, 2))
> - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
> - if (isShuffleEquivalent(V1, V2, Mask, 1, 3))
> - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
> -
> - // Try to use byte rotation instructions.
> - // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
> - if (Subtarget->hasSSSE3())
> - if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
> - DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
> - return Rotate;
> -
> - // If we have direct support for blends, we should lower by decomposing into
> - // a permute. That will be faster than the domain cross.
> - if (IsBlendSupported)
> - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
> - Mask, DAG);
> -
> - // We implement this with SHUFPD which is pretty lame because it will likely
> - // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
> - // However, all the alternatives are still more cycles and newer chips don't
> - // have this problem. It would be really nice if x86 had better shuffles here.
> - V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1);
> - V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2);
> - return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
> - DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
> -}
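
A side note for readers tracing the removed v2i64 path: the InverseMask trick
above works because both mask entries are known non-undef at that point, so
XOR-ing each index with 2 retargets the same lane of the other operand. A
tiny standalone sketch of just that arithmetic (made-up names, not the LLVM
helper itself):

#include <cassert>

// For a two-element shuffle, indices 0-1 select from the first operand and
// 2-3 from the second, so index ^ 2 names the same lane of the other operand.
static void invertV2Mask(int Mask[2]) {
  for (int i = 0; i < 2; ++i)
    Mask[i] ^= 2;
}

int main() {
  int Mask[2] = {1, 2};                  // lane 1 of V1, lane 0 of V2
  invertV2Mask(Mask);
  assert(Mask[0] == 3 && Mask[1] == 0);  // lane 1 of V2, lane 0 of V1
  return 0;
}
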
> -
> -/// \brief Test whether this can be lowered with a single SHUFPS instruction.
> -///
> -/// This is used to disable more specialized lowerings when the shufps lowering
> -/// will happen to be efficient.
> -static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
> - // This routine only handles 128-bit shufps.
> - assert(Mask.size() == 4 && "Unsupported mask size!");
> -
> - // To lower with a single SHUFPS we need to have the low half and high half
> - // each requiring a single input.
> - if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4))
> - return false;
> - if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4))
> - return false;
> -
> - return true;
> -}
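
The predicate above only asks that each 64-bit half of the result draw from a
single source; undef lanes are compatible with either side. A standalone
restatement with a couple of example masks (illustrative only, not the
in-tree helper):

#include <array>
#include <cassert>

// Returns true when the low half (elements 0-1) and the high half (2-3) each
// reference only one of the two inputs; -1 marks an undef lane and matches
// either side.
static bool isSingleShufpsMask(const std::array<int, 4> &Mask) {
  if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4))
    return false;
  if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4))
    return false;
  return true;
}

int main() {
  assert(isSingleShufpsMask({0, 1, 6, 7}));   // low half from V1, high from V2
  assert(isSingleShufpsMask({0, -1, 5, 4}));  // undef lane matches anything
  assert(!isSingleShufpsMask({0, 4, 2, 3}));  // low half mixes V1 and V2
  return 0;
}
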
> -
> -/// \brief Lower a vector shuffle using the SHUFPS instruction.
> -///
> -/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
> -/// It makes no assumptions about whether this is the *best* lowering, it simply
> -/// uses it.
> -static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
> - ArrayRef<int> Mask, SDValue V1,
> - SDValue V2, SelectionDAG &DAG) {
> - SDValue LowV = V1, HighV = V2;
> - int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
> -
> - int NumV2Elements =
> - std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
> -
> - if (NumV2Elements == 1) {
> - int V2Index =
> - std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
> - Mask.begin();
> -
> - // Compute the index adjacent to V2Index and in the same half by toggling
> - // the low bit.
> - int V2AdjIndex = V2Index ^ 1;
> -
> - if (Mask[V2AdjIndex] == -1) {
> - // Handles all the cases where we have a single V2 element and an undef.
> - // This will only ever happen in the high lanes because we commute the
> - // vector otherwise.
> - if (V2Index < 2)
> - std::swap(LowV, HighV);
> - NewMask[V2Index] -= 4;
> - } else {
> - // Handle the case where the V2 element ends up adjacent to a V1 element.
> - // To make this work, blend them together as the first step.
> - int V1Index = V2AdjIndex;
> - int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
> - V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
> - getV4X86ShuffleImm8ForMask(BlendMask, DAG));
> -
> - // Now proceed to reconstruct the final blend as we have the necessary
> - // high or low half formed.
> - if (V2Index < 2) {
> - LowV = V2;
> - HighV = V1;
> - } else {
> - HighV = V2;
> - }
> - NewMask[V1Index] = 2; // We put the V1 element in V2[2].
> - NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
> - }
> - } else if (NumV2Elements == 2) {
> - if (Mask[0] < 4 && Mask[1] < 4) {
> - // Handle the easy case where we have V1 in the low lanes and V2 in the
> - // high lanes.
> - NewMask[2] -= 4;
> - NewMask[3] -= 4;
> - } else if (Mask[2] < 4 && Mask[3] < 4) {
> - // We also handle the reversed case because this utility may get called
> - // when we detect a SHUFPS pattern but can't easily commute the shuffle to
> - // arrange things in the right direction.
> - NewMask[0] -= 4;
> - NewMask[1] -= 4;
> - HighV = V1;
> - LowV = V2;
> - } else {
> - // We have a mixture of V1 and V2 in both low and high lanes. Rather than
> - // trying to place elements directly, just blend them and set up the final
> - // shuffle to place them.
> -
> - // The first two blend mask elements are for V1, the second two are for
> - // V2.
> - int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
> - Mask[2] < 4 ? Mask[2] : Mask[3],
> - (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
> - (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
> - V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
> - getV4X86ShuffleImm8ForMask(BlendMask, DAG));
> -
> - // Now we do a normal shuffle of V1 by giving V1 as both operands to
> - // a blend.
> - LowV = HighV = V1;
> - NewMask[0] = Mask[0] < 4 ? 0 : 2;
> - NewMask[1] = Mask[0] < 4 ? 2 : 0;
> - NewMask[2] = Mask[2] < 4 ? 1 : 3;
> - NewMask[3] = Mask[2] < 4 ? 3 : 1;
> - }
> - }
> - return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
> - getV4X86ShuffleImm8ForMask(NewMask, DAG));
> -}
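
getV4X86ShuffleImm8ForMask itself is not part of this hunk. If it helps to
see the arithmetic the routine above relies on, the sketch below assumes the
usual SSE encoding of four 2-bit lane selectors packed into an 8-bit
immediate, with undef lanes arbitrarily encoded as 0; the helper name and the
exact undef handling here are assumptions, not a quote of the removed code:

#include <cassert>
#include <cstdint>

// Packs a 4-element mask into the imm8 used by SHUFPS/PSHUFD: element i gets
// bits [2*i+1 : 2*i]. Undef lanes (-1) are arbitrarily encoded as 0 here.
static uint8_t shuffleImm8ForMask(const int Mask[4]) {
  uint8_t Imm = 0;
  for (int i = 0; i < 4; ++i) {
    int M = Mask[i] == -1 ? 0 : (Mask[i] & 3);
    Imm |= static_cast<uint8_t>(M) << (2 * i);
  }
  return Imm;
}

int main() {
  const int Identity[4] = {0, 1, 2, 3};
  const int Reverse[4]  = {3, 2, 1, 0};
  assert(shuffleImm8ForMask(Identity) == 0xE4); // binary 11'10'01'00
  assert(shuffleImm8ForMask(Reverse)  == 0x1B); // binary 00'01'10'11
  return 0;
}
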
> -
> -/// \brief Lower 4-lane 32-bit floating point shuffles.
> -///
> -/// Uses instructions exclusively from the floating point unit to minimize
> -/// domain crossing penalties, as these are sufficient to implement all v4f32
> -/// shuffles.
> -static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> - const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> - SDLoc DL(Op);
> - assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
> - assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
> - assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
> - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> - ArrayRef<int> Mask = SVOp->getMask();
> - assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
> -
> - int NumV2Elements =
> - std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
> -
> - if (NumV2Elements == 0) {
> - // Check for being able to broadcast a single element.
> - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
> - Mask, Subtarget, DAG))
> - return Broadcast;
> -
> - // Use even/odd duplicate instructions for masks that match their pattern.
> - if (Subtarget->hasSSE3()) {
> - if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 2, 2))
> - return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
> - if (isShuffleEquivalent(V1, V2, Mask, 1, 1, 3, 3))
> - return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
> - }
> -
> - if (Subtarget->hasAVX()) {
> - // If we have AVX, we can use VPERMILPS which will allow folding a load
> - // into the shuffle.
> - return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
> - getV4X86ShuffleImm8ForMask(Mask, DAG));
> - }
> -
> - // Otherwise, use a straight shuffle of a single input vector. We pass the
> - // input vector to both operands to simulate this with a SHUFPS.
> - return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
> - getV4X86ShuffleImm8ForMask(Mask, DAG));
> - }
> -
> - // There are special ways we can lower some single-element blends. However, we
> - // have custom ways we can lower more complex single-element blends below that
> - // we defer to if both this and BLENDPS fail to match, so restrict this to
> - // when the V2 input is targeting element 0 of the mask -- that is the fast
> - // case here.
> - if (NumV2Elements == 1 && Mask[0] >= 4)
> - if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
> - Mask, Subtarget, DAG))
> - return V;
> -
> - if (Subtarget->hasSSE41()) {
> - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
> - Subtarget, DAG))
> - return Blend;
> -
> - // Use INSERTPS if we can complete the shuffle efficiently.
> - if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
> - return V;
> -
> - if (!isSingleSHUFPSMask(Mask))
> - if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
> - DL, MVT::v4f32, V1, V2, Mask, DAG))
> - return BlendPerm;
> - }
> -
> - // Use dedicated unpack instructions for masks that match their pattern.
> - if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 1, 5))
> - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
> - if (isShuffleEquivalent(V1, V2, Mask, 2, 6, 3, 7))
> - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
> - if (isShuffleEquivalent(V1, V2, Mask, 4, 0, 5, 1))
> - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V2, V1);
> - if (isShuffleEquivalent(V1, V2, Mask, 6, 2, 7, 3))
> - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V2, V1);
> -
> - // Otherwise fall back to a SHUFPS lowering strategy.
> - return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
> -}
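
The isShuffleEquivalent checks above treat -1 (undef) as a wildcard, which is
how the UNPCKL/UNPCKH patterns are recognized even for partially-undef masks.
A simplified standalone version of that matching (the in-tree helper also
inspects the operands so it can accept a few more equivalent forms):

#include <cassert>
#include <cstddef>
#include <initializer_list>
#include <vector>

// Returns true if Mask matches the expected pattern, treating -1 (undef) in
// Mask as a wildcard that matches any expected index.
static bool masksEquivalent(const std::vector<int> &Mask,
                            std::initializer_list<int> Expected) {
  if (Mask.size() != Expected.size())
    return false;
  auto It = Expected.begin();
  for (std::size_t i = 0; i < Mask.size(); ++i, ++It)
    if (Mask[i] != -1 && Mask[i] != *It)
      return false;
  return true;
}

int main() {
  assert(masksEquivalent({0, 4, -1, 5}, {0, 4, 1, 5}));  // UNPCKL-shaped
  assert(!masksEquivalent({0, 4, 3, 5}, {0, 4, 1, 5}));
  return 0;
}
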
> -
> -/// \brief Lower 4-lane i32 vector shuffles.
> -///
> -/// We try to handle these with integer-domain shuffles where we can, but for
> -/// blends we use the floating point domain blend instructions.
> -static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> - const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> - SDLoc DL(Op);
> - assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
> - assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
> - assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
> - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> - ArrayRef<int> Mask = SVOp->getMask();
> - assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
> -
> - // Whenever we can lower this as a zext, that instruction is strictly faster
> - // than any alternative. It also allows us to fold memory operands into the
> - // shuffle in many cases.
> - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
> - Mask, Subtarget, DAG))
> - return ZExt;
> -
> - int NumV2Elements =
> - std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
> -
> - if (NumV2Elements == 0) {
> - // Check for being able to broadcast a single element.
> - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
> - Mask, Subtarget, DAG))
> - return Broadcast;
> -
> - // Straight shuffle of a single input vector. For everything from SSE2
> - // onward this has a single fast instruction with no scary immediates.
> - // We coerce the shuffle pattern to be compatible with UNPCK instructions
> - // but we aren't actually going to use the UNPCK instruction because doing
> - // so prevents folding a load into this instruction or making a copy.
> - const int UnpackLoMask[] = {0, 0, 1, 1};
> - const int UnpackHiMask[] = {2, 2, 3, 3};
> - if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 1, 1))
> - Mask = UnpackLoMask;
> - else if (isShuffleEquivalent(V1, V2, Mask, 2, 2, 3, 3))
> - Mask = UnpackHiMask;
> -
> - return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
> - getV4X86ShuffleImm8ForMask(Mask, DAG));
> - }
> -
> - // Try to use shift instructions.
> - if (SDValue Shift =
> - lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, DAG))
> - return Shift;
> -
> - // There are special ways we can lower some single-element blends.
> - if (NumV2Elements == 1)
> - if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
> - Mask, Subtarget, DAG))
> - return V;
> -
> - // We have different paths for blend lowering, but they all must use the
> - // *exact* same predicate.
> - bool IsBlendSupported = Subtarget->hasSSE41();
> - if (IsBlendSupported)
> - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
> - Subtarget, DAG))
> - return Blend;
> -
> - if (SDValue Masked =
> - lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
> - return Masked;
> -
> - // Use dedicated unpack instructions for masks that match their pattern.
> - if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 1, 5))
> - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
> - if (isShuffleEquivalent(V1, V2, Mask, 2, 6, 3, 7))
> - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
> - if (isShuffleEquivalent(V1, V2, Mask, 4, 0, 5, 1))
> - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V2, V1);
> - if (isShuffleEquivalent(V1, V2, Mask, 6, 2, 7, 3))
> - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V2, V1);
> -
> - // Try to use byte rotation instructions.
> - // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
> - if (Subtarget->hasSSSE3())
> - if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
> - DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
> - return Rotate;
> -
> - // If we have direct support for blends, we should lower by decomposing into
> - // a permute. That will be faster than the domain cross.
> - if (IsBlendSupported)
> - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
> - Mask, DAG);
> -
> - // Try to lower by permuting the inputs into an unpack instruction.
> - if (SDValue Unpack =
> - lowerVectorShuffleAsUnpack(MVT::v4i32, DL, V1, V2, Mask, DAG))
> - return Unpack;
> -
> - // We implement this with SHUFPS because it can blend from two vectors.
> - // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
> - // up the inputs, bypassing domain shift penalties that we would incur if we
> - // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
> - // relevant.
> - return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32,
> - DAG.getVectorShuffle(
> - MVT::v4f32, DL,
> - DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1),
> - DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask));
> -}
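
The {0, 0, 1, 1} / {2, 2, 3, 3} coercion above works because UNPCKL/UNPCKH of
a vector with itself produce exactly those element patterns; the code keeps
PSHUFD for the load-folding benefit but borrows the unpack-shaped mask. A
tiny check of that equivalence (illustrative only):

#include <cassert>
#include <vector>

// UNPCKL on v4i32 interleaves the low halves of its two inputs, i.e. it
// selects elements {0, 4, 1, 5}. When both inputs are the same vector, those
// indices fold back onto the single input as {0, 0, 1, 1}.
static std::vector<int> unpcklSelfMask() {
  const int Interleave[4] = {0, 4, 1, 5};
  std::vector<int> M;
  for (int Idx : Interleave)
    M.push_back(Idx % 4); // both operands are V1, so V2 indices wrap onto V1
  return M;
}

int main() {
  std::vector<int> Expected = {0, 0, 1, 1};
  assert(unpcklSelfMask() == Expected);
  return 0;
}
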
> -
> -/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
> -/// shuffle lowering, and the most complex part.
> -///
> -/// The lowering strategy is to try to form pairs of input lanes which are
> -/// targeted at the same half of the final vector, and then use a dword shuffle
> -/// to place them onto the right half, and finally unpack the paired lanes into
> -/// their final position.
> -///
> -/// The exact breakdown of how to form these dword pairs and align them on the
> -/// correct sides is really tricky. See the comments within the function for
> -/// more of the details.
> -static SDValue lowerV8I16SingleInputVectorShuffle(
> - SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
> - const X86Subtarget *Subtarget, SelectionDAG &DAG) {
> - assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
> - MutableArrayRef<int> LoMask = Mask.slice(0, 4);
> - MutableArrayRef<int> HiMask = Mask.slice(4, 4);
> -
> - SmallVector<int, 4> LoInputs;
> - std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
> - [](int M) { return M >= 0; });
> - std::sort(LoInputs.begin(), LoInputs.end());
> - LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
> - SmallVector<int, 4> HiInputs;
> - std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
> - [](int M) { return M >= 0; });
> - std::sort(HiInputs.begin(), HiInputs.end());
> - HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
> - int NumLToL =
> - std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
> - int NumHToL = LoInputs.size() - NumLToL;
> - int NumLToH =
> - std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
> - int NumHToH = HiInputs.size() - NumLToH;
> - MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
> - MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
> - MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
> - MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
> -
> - // Check for being able to broadcast a single element.
> - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
> - Mask, Subtarget, DAG))
> - return Broadcast;
> -
> - // Try to use shift instructions.
> - if (SDValue Shift =
> - lowerVectorShuffleAsShift(DL, MVT::v8i16, V, V, Mask, DAG))
> - return Shift;
> -
> - // Use dedicated unpack instructions for masks that match their pattern.
> - if (isShuffleEquivalent(V, V, Mask, 0, 0, 1, 1, 2, 2, 3, 3))
> - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
> - if (isShuffleEquivalent(V, V, Mask, 4, 4, 5, 5, 6, 6, 7, 7))
> - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
> -
> - // Try to use byte rotation instructions.
> - if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
> - DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
> - return Rotate;
> -
> - // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
> - // such inputs we can swap two of the dwords across the half mark and end up
> - // with <=2 inputs from each half in each half. Once there, we can fall through
> - // to the generic code below. For example:
> - //
> - // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
> - // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
> - //
> - // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
> - // and an existing 2-into-2 on the other half. In this case we may have to
> - // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
> - // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
> - // Fortunately, we don't have to handle anything but a 2-into-2 pattern
> - // because any other situation (including a 3-into-1 or 1-into-3 in the other
> - // half than the one we target for fixing) will be fixed when we re-enter this
> - // path. We will also combine away any resulting sequence of PSHUFD
> - // instructions into a single instruction. Here is an example of the tricky case:
> - //
> - // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
> - // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
> - //
> - // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
> - //
> - // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
> - // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
> - //
> - // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
> - // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
> - //
> - // The result is fine to be handled by the generic logic.
> - auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
> - ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
> - int AOffset, int BOffset) {
> - assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
> - "Must call this with A having 3 or 1 inputs from the A half.");
> - assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
> - "Must call this with B having 1 or 3 inputs from the B half.");
> - assert(AToAInputs.size() + BToAInputs.size() == 4 &&
> - "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
> -
> - // Compute the index of the dword that holds only one of the three inputs in
> - // a half: take the sum of the word indices in the half with three inputs and
> - // subtract the sum of the actual three inputs. The difference is the
> - // remaining (non-input) slot.
> - int ADWord, BDWord;
> - int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord;
> - int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord;
> - int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset;
> - ArrayRef<int> TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs;
> - int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0];
> - int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
> - int TripleNonInputIdx =
> - TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
> - TripleDWord = TripleNonInputIdx / 2;
> -
> - // We use xor with one to compute the adjacent DWord to whichever one the
> - // OneInput is in.
> - OneInputDWord = (OneInput / 2) ^ 1;
> -
> - // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
> - // and BToA inputs. If there is also such a problem with the BToB and AToB
> - // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
> - // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
> - // is essential that we don't *create* a 3<-1 as then we might oscillate.
> - if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
> - // Compute how many inputs will be flipped by swapping these DWords. We
> - // need to balance this to ensure we don't form a 3-1 shuffle in the
> - // other half.
> - int NumFlippedAToBInputs =
> - std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
> - std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
> - int NumFlippedBToBInputs =
> - std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
> - std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
> - if ((NumFlippedAToBInputs == 1 &&
> - (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
> - (NumFlippedBToBInputs == 1 &&
> - (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
> - // We choose whether to fix the A half or B half based on whether that
> - // half has zero flipped inputs. At zero, we may not be able to fix it
> - // with that half. We also bias towards fixing the B half because that
> - // will more commonly be the high half, and we have to bias one way.
> - auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
> - ArrayRef<int> Inputs) {
> - int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
> - bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
> - PinnedIdx ^ 1) != Inputs.end();
> - // Determine whether the free index is in the flipped dword or the
> - // unflipped dword based on where the pinned index is. We use this bit
> - // in an xor to conditionally select the adjacent dword.
> - int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
> - bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
> - FixFreeIdx) != Inputs.end();
> - if (IsFixIdxInput == IsFixFreeIdxInput)
> - FixFreeIdx += 1;
> - IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
> - FixFreeIdx) != Inputs.end();
> - assert(IsFixIdxInput != IsFixFreeIdxInput &&
> - "We need to be changing the number of flipped inputs!");
> - int PSHUFHalfMask[] = {0, 1, 2, 3};
> - std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
> - V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
> - MVT::v8i16, V,
> - getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DAG));
> -
> - for (int &M : Mask)
> - if (M != -1 && M == FixIdx)
> - M = FixFreeIdx;
> - else if (M != -1 && M == FixFreeIdx)
> - M = FixIdx;
> - };
> - if (NumFlippedBToBInputs != 0) {
> - int BPinnedIdx =
> - BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
> - FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
> - } else {
> - assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
> - int APinnedIdx =
> - AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
> - FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
> - }
> - }
> - }
> -
> - int PSHUFDMask[] = {0, 1, 2, 3};
> - PSHUFDMask[ADWord] = BDWord;
> - PSHUFDMask[BDWord] = ADWord;
> - V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
> - DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
> - DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
> - getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
> -
> - // Adjust the mask to match the new locations of A and B.
> - for (int &M : Mask)
> - if (M != -1 && M/2 == ADWord)
> - M = 2 * BDWord + M % 2;
> - else if (M != -1 && M/2 == BDWord)
> - M = 2 * ADWord + M % 2;
> -
> - // Recurse back into this routine to re-compute state now that this isn't
> - // a 3 and 1 problem.
> - return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
> - Mask);
> - };
> - if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
> - return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
> - else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
> - return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
> -
> - // At this point there are at most two inputs to the low and high halves from
> - // each half. That means the inputs can always be grouped into dwords and
> - // those dwords can then be moved to the correct half with a dword shuffle.
> - // We use at most one low and one high word shuffle to collect these paired
> - // inputs into dwords, and finally a dword shuffle to place them.
> - int PSHUFLMask[4] = {-1, -1, -1, -1};
> - int PSHUFHMask[4] = {-1, -1, -1, -1};
> - int PSHUFDMask[4] = {-1, -1, -1, -1};
> -
> - // First fix the masks for all the inputs that are staying in their
> - // original halves. This will then dictate the targets of the cross-half
> - // shuffles.
> - auto fixInPlaceInputs =
> - [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
> - MutableArrayRef<int> SourceHalfMask,
> - MutableArrayRef<int> HalfMask, int HalfOffset) {
> - if (InPlaceInputs.empty())
> - return;
> - if (InPlaceInputs.size() == 1) {
> - SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
> - InPlaceInputs[0] - HalfOffset;
> - PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
> - return;
> - }
> - if (IncomingInputs.empty()) {
> - // Just fix all of the in place inputs.
> - for (int Input : InPlaceInputs) {
> - SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
> - PSHUFDMask[Input / 2] = Input / 2;
> - }
> - return;
> - }
> -
> - assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
> - SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
> - InPlaceInputs[0] - HalfOffset;
> - // Put the second input next to the first so that they are packed into
> - // a dword. We find the adjacent index by toggling the low bit.
> - int AdjIndex = InPlaceInputs[0] ^ 1;
> - SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
> - std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
> - PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
> - };
> - fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
> - fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
> -
> - // Now gather the cross-half inputs and place them into a free dword of
> - // their target half.
> - // FIXME: This operation could almost certainly be simplified dramatically to
> - // look more like the 3-1 fixing operation.
> - auto moveInputsToRightHalf = [&PSHUFDMask](
> - MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
> - MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
> - MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
> - int DestOffset) {
> - auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
> - return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
> - };
> - auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
> - int Word) {
> - int LowWord = Word & ~1;
> - int HighWord = Word | 1;
> - return isWordClobbered(SourceHalfMask, LowWord) ||
> - isWordClobbered(SourceHalfMask, HighWord);
> - };
> -
> - if (IncomingInputs.empty())
> - return;
> -
> - if (ExistingInputs.empty()) {
> - // Map any dwords with inputs from them into the right half.
> - for (int Input : IncomingInputs) {
> - // If the source half mask maps over the inputs, turn those into
> - // swaps and use the swapped lane.
> - if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
> - if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) {
> - SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
> - Input - SourceOffset;
> - // We have to swap the uses in our half mask in one sweep.
> - for (int &M : HalfMask)
> - if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
> - M = Input;
> - else if (M == Input)
> - M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
> - } else {
> - assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
> - Input - SourceOffset &&
> - "Previous placement doesn't match!");
> - }
> - // Note that this correctly re-maps both when we do a swap and when
> - // we observe the other side of the swap above. We rely on that to
> - // avoid swapping the members of the input list directly.
> - Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
> - }
> -
> - // Map the input's dword into the correct half.
> - if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1)
> - PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
> - else
> - assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
> - Input / 2 &&
> - "Previous placement doesn't match!");
> - }
> -
> - // And just directly shift any other-half mask elements to be same-half
> - // as we will have mirrored the dword containing the element into the
> - // same position within that half.
> - for (int &M : HalfMask)
> - if (M >= SourceOffset && M < SourceOffset + 4) {
> - M = M - SourceOffset + DestOffset;
> - assert(M >= 0 && "This should never wrap below zero!");
> - }
> - return;
> - }
> -
> - // Ensure we have the input in a viable dword of its current half. This
> - // is particularly tricky because the original position may be clobbered
> - // by inputs being moved and *staying* in that half.
> - if (IncomingInputs.size() == 1) {
> - if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
> - int InputFixed = std::find(std::begin(SourceHalfMask),
> - std::end(SourceHalfMask), -1) -
> - std::begin(SourceHalfMask) + SourceOffset;
> - SourceHalfMask[InputFixed - SourceOffset] =
> - IncomingInputs[0] - SourceOffset;
> - std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
> - InputFixed);
> - IncomingInputs[0] = InputFixed;
> - }
> - } else if (IncomingInputs.size() == 2) {
> - if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
> - isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
> - // We have two non-adjacent or clobbered inputs we need to extract from
> - // the source half. To do this, we need to map them into some adjacent
> - // dword slot in the source mask.
> - int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
> - IncomingInputs[1] - SourceOffset};
> -
> - // If there is a free slot in the source half mask adjacent to one of
> - // the inputs, place the other input in it. We use (Index XOR 1) to
> - // compute an adjacent index.
> - if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
> - SourceHalfMask[InputsFixed[0] ^ 1] == -1) {
> - SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
> - SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
> - InputsFixed[1] = InputsFixed[0] ^ 1;
> - } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
> - SourceHalfMask[InputsFixed[1] ^ 1] == -1) {
> - SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
> - SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
> - InputsFixed[0] = InputsFixed[1] ^ 1;
> - } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 &&
> - SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) {
> - // The two inputs are in the same DWord but it is clobbered and the
> - // adjacent DWord isn't used at all. Move both inputs to the free
> - // slot.
> - SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
> - SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
> - InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
> - InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
> - } else {
> - // The only way we hit this point is if there is no clobbering
> - // (because there are no off-half inputs to this half) and there is no
> - // free slot adjacent to one of the inputs. In this case, we have to
> - // swap an input with a non-input.
> - for (int i = 0; i < 4; ++i)
> - assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) &&
> - "We can't handle any clobbers here!");
> - assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
> - "Cannot have adjacent inputs here!");
> -
> - SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
> - SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
> -
> - // We also have to update the final source mask in this case because
> - // it may need to undo the above swap.
> - for (int &M : FinalSourceHalfMask)
> - if (M == (InputsFixed[0] ^ 1) + SourceOffset)
> - M = InputsFixed[1] + SourceOffset;
> - else if (M == InputsFixed[1] + SourceOffset)
> - M = (InputsFixed[0] ^ 1) + SourceOffset;
> -
> - InputsFixed[1] = InputsFixed[0] ^ 1;
> - }
> -
> - // Point everything at the fixed inputs.
> - for (int &M : HalfMask)
> - if (M == IncomingInputs[0])
> - M = InputsFixed[0] + SourceOffset;
> - else if (M == IncomingInputs[1])
> - M = InputsFixed[1] + SourceOffset;
> -
> - IncomingInputs[0] = InputsFixed[0] + SourceOffset;
> - IncomingInputs[1] = InputsFixed[1] + SourceOffset;
> - }
> - } else {
> - llvm_unreachable("Unhandled input size!");
> - }
> -
> - // Now hoist the DWord down to the right half.
> - int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
> - assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
> - PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
> - for (int &M : HalfMask)
> - for (int Input : IncomingInputs)
> - if (M == Input)
> - M = FreeDWord * 2 + Input % 2;
> - };
> - moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
> - /*SourceOffset*/ 4, /*DestOffset*/ 0);
> - moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
> - /*SourceOffset*/ 0, /*DestOffset*/ 4);
> -
> - // Now enact all the shuffles we've computed to move the inputs into their
> - // target half.
> - if (!isNoopShuffleMask(PSHUFLMask))
> - V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
> - getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG));
> - if (!isNoopShuffleMask(PSHUFHMask))
> - V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
> - getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG));
> - if (!isNoopShuffleMask(PSHUFDMask))
> - V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
> - DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
> - DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
> - getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
> -
> - // At this point, each half should contain all its inputs, and we can then
> - // just shuffle them into their final position.
> - assert(std::count_if(LoMask.begin(), LoMask.end(),
> - [](int M) { return M >= 4; }) == 0 &&
> - "Failed to lift all the high half inputs to the low mask!");
> - assert(std::count_if(HiMask.begin(), HiMask.end(),
> - [](int M) { return M >= 0 && M < 4; }) == 0 &&
> - "Failed to lift all the low half inputs to the high mask!");
> -
> - // Do a half shuffle for the low mask.
> - if (!isNoopShuffleMask(LoMask))
> - V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
> - getV4X86ShuffleImm8ForMask(LoMask, DAG));
> -
> - // Do a half shuffle with the high mask after shifting its values down.
> - for (int &M : HiMask)
> - if (M >= 0)
> - M -= 4;
> - if (!isNoopShuffleMask(HiMask))
> - V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
> - getV4X86ShuffleImm8ForMask(HiMask, DAG));
> -
> - return V;
> -}
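
The dword-swap remapping near the end of balanceSides is easy to check by
hand against the worked example in the comment block above. Here is just that
index arithmetic in standalone form, reproducing the PSHUFD[0,2,1,3] example
mask (the comment uses that case to show why a pre-shuffle is sometimes
needed; the remapping arithmetic itself is the same either way). Names are
made up; this shows only the mask transform, not the lowering decision:

#include <cassert>
#include <vector>

// After a PSHUFD swaps dwords ADWord and BDWord of a v8i16 value, any word
// index that pointed into one of those dwords must be redirected to the
// other, keeping its even/odd position within the dword.
static void remapAfterDWordSwap(std::vector<int> &Mask, int ADWord,
                                int BDWord) {
  for (int &M : Mask) {
    if (M == -1)
      continue;
    if (M / 2 == ADWord)
      M = 2 * BDWord + M % 2;
    else if (M / 2 == BDWord)
      M = 2 * ADWord + M % 2;
  }
}

int main() {
  // PSHUFD[0,2,1,3] swaps dwords 1 and 2 (words 2-3 and 4-5).
  std::vector<int> Mask = {3, 7, 1, 0, 2, 7, 3, 5};
  remapAfterDWordSwap(Mask, 1, 2);
  std::vector<int> Expected = {5, 7, 1, 0, 4, 7, 5, 3};
  assert(Mask == Expected);
  return 0;
}
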
> -
> -/// \brief Helper to form a PSHUFB-based shuffle+blend.
> -static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1,
> - SDValue V2, ArrayRef<int> Mask,
> - SelectionDAG &DAG, bool &V1InUse,
> - bool &V2InUse) {
> - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
> - SDValue V1Mask[16];
> - SDValue V2Mask[16];
> - V1InUse = false;
> - V2InUse = false;
> -
> - int Size = Mask.size();
> - int Scale = 16 / Size;
> - for (int i = 0; i < 16; ++i) {
> - if (Mask[i / Scale] == -1) {
> - V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
> - } else {
> - const int ZeroMask = 0x80;
> - int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
> - : ZeroMask;
> - int V2Idx = Mask[i / Scale] < Size
> - ? ZeroMask
> - : (Mask[i / Scale] - Size) * Scale + i % Scale;
> - if (Zeroable[i / Scale])
> - V1Idx = V2Idx = ZeroMask;
> - V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8);
> - V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8);
> - V1InUse |= (ZeroMask != V1Idx);
> - V2InUse |= (ZeroMask != V2Idx);
> - }
> - }
> -
> - if (V1InUse)
> - V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
> - DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V1),
> - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
> - if (V2InUse)
> - V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
> - DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V2),
> - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
> -
> - // If we need shuffled inputs from both, blend the two.
> - SDValue V;
> - if (V1InUse && V2InUse)
> - V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
> - else
> - V = V1InUse ? V1 : V2;
> -
> - // Cast the result back to the correct type.
> - return DAG.getNode(ISD::BITCAST, DL, VT, V);
> -}
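
If the construction of the two byte-level control vectors is hard to follow,
here is the same index arithmetic in standalone form. The 0x80 control byte
zeroes an output byte, which is what lets the final OR act as the blend. This
illustrates the computation only; the real code leaves undef lanes undef
rather than zeroing them, and emits the masks as BUILD_VECTOR constants:

#include <cassert>
#include <vector>

// Expands an element-level shuffle mask into two per-byte PSHUFB control
// vectors, one per input; bytes not sourced from an input get 0x80 (zero).
static void buildPshufbMasks(const std::vector<int> &Mask,
                             std::vector<int> &V1Ctl,
                             std::vector<int> &V2Ctl) {
  const int ZeroByte = 0x80;
  int Size = static_cast<int>(Mask.size());
  int Scale = 16 / Size; // bytes per mask element
  V1Ctl.assign(16, ZeroByte);
  V2Ctl.assign(16, ZeroByte);
  for (int i = 0; i < 16; ++i) {
    int M = Mask[i / Scale];
    if (M < 0)
      continue; // undef element: simplified to "zero both" in this sketch
    if (M < Size)
      V1Ctl[i] = M * Scale + i % Scale;
    else
      V2Ctl[i] = (M - Size) * Scale + i % Scale;
  }
}

int main() {
  // A v8i16 mask pulling even words from V1 and odd words from V2.
  std::vector<int> Mask = {0, 9, 2, 11, 4, 13, 6, 15};
  std::vector<int> V1Ctl, V2Ctl;
  buildPshufbMasks(Mask, V1Ctl, V2Ctl);
  assert(V1Ctl[0] == 0 && V1Ctl[1] == 1 && V1Ctl[2] == 0x80);
  assert(V2Ctl[0] == 0x80 && V2Ctl[2] == 2 && V2Ctl[3] == 3);
  return 0;
}
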
> -
> -/// \brief Generic lowering of 8-lane i16 shuffles.
> -///
> -/// This handles both single-input shuffles and combined shuffle/blends with
> -/// two inputs. The single input shuffles are immediately delegated to
> -/// a dedicated lowering routine.
> -///
> -/// The blends are lowered in one of three fundamental ways. If there are few
> -/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
> -/// of the input is significantly cheaper when lowered as an interleaving of
> -/// the two inputs, try to interleave them. Otherwise, blend the low and high
> -/// halves of the inputs separately (making them have relatively few inputs)
> -/// and then concatenate them.
> -static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> - const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> - SDLoc DL(Op);
> - assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
> - assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
> - assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
> - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> - ArrayRef<int> OrigMask = SVOp->getMask();
> - int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
> - OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
> - MutableArrayRef<int> Mask(MaskStorage);
> -
> - assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
> -
> - // Whenever we can lower this as a zext, that instruction is strictly faster
> - // than any alternative.
> - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
> - DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
> - return ZExt;
> -
> - auto isV1 = [](int M) { return M >= 0 && M < 8; };
> - auto isV2 = [](int M) { return M >= 8; };
> -
> - int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1);
> - int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
> -
> - if (NumV2Inputs == 0)
> - return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG);
> -
> - assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
> - "to be V1-input shuffles.");
> -
> - // Try to use shift instructions.
> - if (SDValue Shift =
> - lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG))
> - return Shift;
> -
> - // There are special ways we can lower some single-element blends.
> - if (NumV2Inputs == 1)
> - if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
> - Mask, Subtarget, DAG))
> - return V;
> -
> - // We have different paths for blend lowering, but they all must use the
> - // *exact* same predicate.
> - bool IsBlendSupported = Subtarget->hasSSE41();
> - if (IsBlendSupported)
> - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
> - Subtarget, DAG))
> - return Blend;
> -
> - if (SDValue Masked =
> - lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
> - return Masked;
> -
> - // Use dedicated unpack instructions for masks that match their pattern.
> - if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 1, 9, 2, 10, 3, 11))
> - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
> - if (isShuffleEquivalent(V1, V2, Mask, 4, 12, 5, 13, 6, 14, 7, 15))
> - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
> -
> - // Try to use byte rotation instructions.
> - if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
> - DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
> - return Rotate;
> -
> - if (SDValue BitBlend =
> - lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
> - return BitBlend;
> -
> - if (SDValue Unpack =
> - lowerVectorShuffleAsUnpack(MVT::v8i16, DL, V1, V2, Mask, DAG))
> - return Unpack;
> -
> - // If we can't directly blend but can use PSHUFB, that will be better as it
> - // can both shuffle and set up the inefficient blend.
> - if (!IsBlendSupported && Subtarget->hasSSSE3()) {
> - bool V1InUse, V2InUse;
> - return lowerVectorShuffleAsPSHUFB(DL, MVT::v8i16, V1, V2, Mask, DAG,
> - V1InUse, V2InUse);
> - }
> -
> - // We can always bit-blend if we have to so the fallback strategy is to
> - // decompose into single-input permutes and blends.
> - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
> - Mask, DAG);
> -}
> -
> -/// \brief Check whether a compaction lowering can be done by dropping even
> -/// elements and compute how many times even elements must be dropped.
> -///
> -/// This handles shuffles which take every Nth element where N is a power of
> -/// two. Example shuffle masks:
> -///
> -/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
> -/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
> -/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
> -/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
> -/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
> -/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
> -///
> -/// Any of these lanes can of course be undef.
> -///
> -/// This routine only supports N <= 3.
> -/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
> -/// for larger N.
> -///
> -/// \returns N above, or the number of times even elements must be dropped if
> -/// there is such a number. Otherwise returns zero.
> -static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
> - // Figure out whether we're looping over two inputs or just one.
> - bool IsSingleInput = isSingleInputShuffleMask(Mask);
> -
> - // The modulus for the shuffle vector entries is based on whether this is
> - // a single input or not.
> - int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
> - assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
> - "We should only be called with masks with a power-of-2 size!");
> -
> - uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
> -
> - // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
> - // and 2^3 simultaneously. This is because we may have ambiguity with
> - // partially undef inputs.
> - bool ViableForN[3] = {true, true, true};
> -
> - for (int i = 0, e = Mask.size(); i < e; ++i) {
> - // Ignore undef lanes; we'll optimistically collapse them to the pattern we
> - // want.
> - if (Mask[i] == -1)
> - continue;
> -
> - bool IsAnyViable = false;
> - for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
> - if (ViableForN[j]) {
> - uint64_t N = j + 1;
> -
> - // The shuffle mask must be equal to (i * 2^N) % M.
> - if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
> - IsAnyViable = true;
> - else
> - ViableForN[j] = false;
> - }
> - // Early exit if we exhaust the possible powers of two.
> - if (!IsAnyViable)
> - break;
> - }
> -
> - for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
> - if (ViableForN[j])
> - return j + 1;
> -
> - // Return 0 as there is no viable power of two.
> - return 0;
> -}
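
The stride test above can be exercised directly against the example masks in
its doc comment. A standalone restatement (same logic with simplified control
flow; names are made up, not the in-tree helper):

#include <cassert>
#include <vector>

// Returns N in 1..3 if the mask takes every 2^N-th element (modulo the
// single- or dual-input width), else 0. -1 lanes are wildcards.
static int evenDropCount(const std::vector<int> &Mask, bool SingleInput) {
  int Modulus = static_cast<int>(Mask.size()) * (SingleInput ? 1 : 2);
  bool Viable[3] = {true, true, true};
  for (int i = 0, e = static_cast<int>(Mask.size()); i < e; ++i) {
    if (Mask[i] == -1)
      continue;
    bool Any = false;
    for (int j = 0; j < 3; ++j)
      if (Viable[j]) {
        int N = j + 1;
        if (Mask[i] == ((i << N) & (Modulus - 1)))
          Any = true;
        else
          Viable[j] = false;
      }
    if (!Any)
      return 0;
  }
  for (int j = 0; j < 3; ++j)
    if (Viable[j])
      return j + 1;
  return 0;
}

int main() {
  // The dual-input N = 2 example from the doc comment above.
  std::vector<int> M = {0, 4, 8, 12, 16, 20, 24, 28,
                        0, 4, 8, 12, 16, 20, 24, 28};
  assert(evenDropCount(M, /*SingleInput=*/false) == 2);
  return 0;
}
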
> -
> -/// \brief Generic lowering of v16i8 shuffles.
> -///
> -/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
> -/// detect any complexity reducing interleaving. If that doesn't help, it uses
> -/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
> -/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
> -/// back together.
> -static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> - const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> - SDLoc DL(Op);
> - assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
> - assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
> - assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
> - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> - ArrayRef<int> Mask = SVOp->getMask();
> - assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
> -
> - // Try to use shift instructions.
> - if (SDValue Shift =
> - lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, DAG))
> - return Shift;
> -
> - // Try to use byte rotation instructions.
> - if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
> - DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
> - return Rotate;
> -
> - // Try to use a zext lowering.
> - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
> - DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
> - return ZExt;
> -
> - int NumV2Elements =
> - std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
> -
> - // For single-input shuffles, there are some nicer lowering tricks we can use.
> - if (NumV2Elements == 0) {
> - // Check for being able to broadcast a single element.
> - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
> - Mask, Subtarget, DAG))
> - return Broadcast;
> -
> - // Check whether we can widen this to an i16 shuffle by duplicating bytes.
> - // Notably, this handles splat and partial-splat shuffles more efficiently.
> - // However, it only makes sense if the pre-duplication shuffle simplifies
> - // things significantly. Currently, this means we need to be able to
> - // express the pre-duplication shuffle as an i16 shuffle.
> - //
> - // FIXME: We should check for other patterns which can be widened into an
> - // i16 shuffle as well.
> - auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
> - for (int i = 0; i < 16; i += 2)
> - if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1])
> - return false;
> -
> - return true;
> - };
> - auto tryToWidenViaDuplication = [&]() -> SDValue {
> - if (!canWidenViaDuplication(Mask))
> - return SDValue();
> - SmallVector<int, 4> LoInputs;
> - std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
> - [](int M) { return M >= 0 && M < 8; });
> - std::sort(LoInputs.begin(), LoInputs.end());
> - LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
> - LoInputs.end());
> - SmallVector<int, 4> HiInputs;
> - std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
> - [](int M) { return M >= 8; });
> - std::sort(HiInputs.begin(), HiInputs.end());
> - HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
> - HiInputs.end());
> -
> - bool TargetLo = LoInputs.size() >= HiInputs.size();
> - ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
> - ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
> -
> - int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
> - SmallDenseMap<int, int, 8> LaneMap;
> - for (int I : InPlaceInputs) {
> - PreDupI16Shuffle[I/2] = I/2;
> - LaneMap[I] = I;
> - }
> - int j = TargetLo ? 0 : 4, je = j + 4;
> - for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
> - // Check if j is already a shuffle of this input. This happens when
> - // there are two adjacent bytes after we move the low one.
> - if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
> - // If we haven't yet mapped the input, search for a slot into which
> - // we can map it.
> - while (j < je && PreDupI16Shuffle[j] != -1)
> - ++j;
> -
> - if (j == je)
> - // We can't place the inputs into a single half with a simple
> - // i16 shuffle, so bail.
> - return SDValue();
> -
> - // Map this input with the i16 shuffle.
> - PreDupI16Shuffle[j] = MovingInputs[i] / 2;
> - }
> -
> - // Update the lane map based on the mapping we ended up with.
> - LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
> - }
> - V1 = DAG.getNode(
> - ISD::BITCAST, DL, MVT::v16i8,
> - DAG.getVectorShuffle(MVT::v8i16, DL,
> - DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
> - DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
> -
> - // Unpack the bytes to form the i16s that will be shuffled into place.
> - V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
> - MVT::v16i8, V1, V1);
> -
> - int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
> - for (int i = 0; i < 16; ++i)
> - if (Mask[i] != -1) {
> - int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
> - assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
> - if (PostDupI16Shuffle[i / 2] == -1)
> - PostDupI16Shuffle[i / 2] = MappedMask;
> - else
> - assert(PostDupI16Shuffle[i / 2] == MappedMask &&
> - "Conflicting entrties in the original shuffle!");
> - }
> - return DAG.getNode(
> - ISD::BITCAST, DL, MVT::v16i8,
> - DAG.getVectorShuffle(MVT::v8i16, DL,
> - DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
> - DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
> - };
> - if (SDValue V = tryToWidenViaDuplication())
> - return V;
> - }
> -
> - // Use dedicated unpack instructions for masks that match their pattern.
> - if (isShuffleEquivalent(V1, V2, Mask,
> - 0, 16, 1, 17, 2, 18, 3, 19,
> - 4, 20, 5, 21, 6, 22, 7, 23))
> - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V1, V2);
> - if (isShuffleEquivalent(V1, V2, Mask,
> - 8, 24, 9, 25, 10, 26, 11, 27,
> - 12, 28, 13, 29, 14, 30, 15, 31))
> - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V1, V2);
> -
> - // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
> - // with PSHUFB. It is important to do this before we attempt to generate any
> - // blends but after all of the single-input lowerings. If the single input
> - // lowerings can find an instruction sequence that is faster than a PSHUFB, we
> - // want to preserve that and we can DAG combine any longer sequences into
> - // a PSHUFB in the end. But once we start blending from multiple inputs,
> - // the complexity of DAG combining bad patterns back into PSHUFB is too high,
> - // and there are *very* few patterns that would actually be faster than the
> - // PSHUFB approach because of its ability to zero lanes.
> - //
> - // FIXME: The only exceptions to the above are blends which are exact
> - // interleavings with direct instructions supporting them. We currently don't
> - // handle those well here.
> - if (Subtarget->hasSSSE3()) {
> - bool V1InUse = false;
> - bool V2InUse = false;
> -
> - SDValue PSHUFB = lowerVectorShuffleAsPSHUFB(DL, MVT::v16i8, V1, V2, Mask,
> - DAG, V1InUse, V2InUse);
> -
> - // If both V1 and V2 are in use and we can use a direct blend or an unpack,
> - // do so. This avoids using them to handle blends-with-zero which is
> - // important as a single pshufb is significantly faster for that.
> - if (V1InUse && V2InUse) {
> - if (Subtarget->hasSSE41())
> - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2,
> - Mask, Subtarget, DAG))
> - return Blend;
> -
> - // We can use an unpack to do the blending rather than an or in some
> - // cases. Even though the or may be (very marginally) more efficient, we
> - // prefer this lowering because there are common cases where part of
> - // the complexity of the shuffles goes away when we do the final blend as
> - // an unpack.
> - // FIXME: It might be worth trying to detect if the unpack-feeding
> - // shuffles will both be pshufb, in which case we shouldn't bother with
> - // this.
> - if (SDValue Unpack =
> - lowerVectorShuffleAsUnpack(MVT::v16i8, DL, V1, V2, Mask, DAG))
> - return Unpack;
> - }
> -
> - return PSHUFB;
> - }
> -
> - // There are special ways we can lower some single-element blends.
> - if (NumV2Elements == 1)
> - if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2,
> - Mask, Subtarget, DAG))
> - return V;
> -
> - if (SDValue BitBlend =
> - lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
> - return BitBlend;
> -
> - // Check whether a compaction lowering can be done. This handles shuffles
> - // which take every Nth element for some even N. See the helper function for
> - // details.
> - //
> - // We special case these as they can be particularly efficiently handled with
> - // the PACKUSWB instruction on x86 and they show up in common patterns of
> - // rearranging bytes to truncate wide elements.
> - if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) {
> - // NumEvenDrops is the log2 of the element stride. Another way of
> - // thinking about it is that we need to drop the even elements this many
> - // times to get the original input.
> - bool IsSingleInput = isSingleInputShuffleMask(Mask);
> -
> - // First we need to zero all the dropped bytes.
> - assert(NumEvenDrops <= 3 &&
> - "No support for dropping even elements more than 3 times.");
> - // We use the mask type to pick which bytes are preserved based on how many
> - // elements are dropped.
> - MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
> - SDValue ByteClearMask =
> - DAG.getNode(ISD::BITCAST, DL, MVT::v16i8,
> - DAG.getConstant(0xFF, MaskVTs[NumEvenDrops - 1]));
> - V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
> - if (!IsSingleInput)
> - V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
> -
> - // Now pack things back together.
> - V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
> - V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
> - SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
> - for (int i = 1; i < NumEvenDrops; ++i) {
> - Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result);
> - Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
> - }
> -
> - return Result;
> - }
> -
> - // Handle multi-input cases by blending single-input shuffles.
> - if (NumV2Elements > 0)
> - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
> - Mask, DAG);
> -
> - // The fallback path for single-input shuffles widens this into two v8i16
> - // vectors with unpacks, shuffles those, and then pulls them back together
> - // with a pack.
> - SDValue V = V1;
> -
> - int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
> - int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
> - for (int i = 0; i < 16; ++i)
> - if (Mask[i] >= 0)
> - (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
> -
> - SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
> -
> - SDValue VLoHalf, VHiHalf;
> - // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
> - // them out and avoid using UNPCK{L,H} to extract the elements of V as
> - // i16s.
> - if (std::none_of(std::begin(LoBlendMask), std::end(LoBlendMask),
> - [](int M) { return M >= 0 && M % 2 == 1; }) &&
> - std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask),
> - [](int M) { return M >= 0 && M % 2 == 1; })) {
> - // Use a mask to drop the high bytes.
> - VLoHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
> - VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
> - DAG.getConstant(0x00FF, MVT::v8i16));
> -
> - // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
> - VHiHalf = DAG.getUNDEF(MVT::v8i16);
> -
> - // Squash the masks to point directly into VLoHalf.
> - for (int &M : LoBlendMask)
> - if (M >= 0)
> - M /= 2;
> - for (int &M : HiBlendMask)
> - if (M >= 0)
> - M /= 2;
> - } else {
> - // Otherwise just unpack the low half of V into VLoHalf and the high half into
> - // VHiHalf so that we can blend them as i16s.
> - VLoHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
> - DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
> - VHiHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
> - DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
> - }
> -
> - SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
> - SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
> -
> - return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
> -}
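
For the compaction path above, the reason one "clear the odd bytes, then
PACKUS" round keeps the even-indexed bytes is just unsigned saturation on
values that already fit in a byte. A small simulation of one round
(arithmetic illustration only, not DAG code; little-endian lane order
assumed):

#include <cassert>
#include <cstdint>
#include <vector>

// Unsigned-saturating pack of one i16 lane down to a byte (PACKUSWB).
static uint8_t satPackWord(uint8_t Lo, uint8_t Hi) {
  int W = Lo | (Hi << 8);                          // i16 lane value
  return static_cast<uint8_t>(W > 255 ? 255 : W);  // unsigned saturation
}

// One round: clear the high byte of every i16 lane (the ByteClearMask AND in
// the code above), then pack both inputs. The result keeps the even-indexed
// bytes of A in its low half and of B in its high half.
static std::vector<uint8_t> clearAndPack(std::vector<uint8_t> A,
                                         std::vector<uint8_t> B) {
  for (int i = 1; i < 16; i += 2) {
    A[i] = 0;
    B[i] = 0;
  }
  std::vector<uint8_t> R(16);
  for (int i = 0; i < 8; ++i) {
    R[i] = satPackWord(A[2 * i], A[2 * i + 1]);      // low 8 lanes from A
    R[8 + i] = satPackWord(B[2 * i], B[2 * i + 1]);  // high 8 lanes from B
  }
  return R;
}

int main() {
  std::vector<uint8_t> A(16), B(16);
  for (int i = 0; i < 16; ++i) { A[i] = i; B[i] = 16 + i; }
  std::vector<uint8_t> R = clearAndPack(A, B);
  assert(R[0] == 0 && R[1] == 2 && R[7] == 14);  // even bytes of A
  assert(R[8] == 16 && R[15] == 30);             // even bytes of B
  return 0;
}
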
> -
> -/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
> -///
> -/// This routine breaks down the specific type of 128-bit shuffle and
> -/// dispatches to the lowering routines accordingly.
> -static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> - MVT VT, const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> - switch (VT.SimpleTy) {
> - case MVT::v2i64:
> - return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
> - case MVT::v2f64:
> - return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
> - case MVT::v4i32:
> - return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
> - case MVT::v4f32:
> - return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
> - case MVT::v8i16:
> - return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
> - case MVT::v16i8:
> - return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
> -
> - default:
> - llvm_unreachable("Unimplemented!");
> - }
> -}
> -
> -/// \brief Helper function to test whether a shuffle mask could be
> -/// simplified by widening the elements being shuffled.
> -///
> -/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
> -/// leaves it in an unspecified state.
> -///
> -/// NOTE: This must handle normal vector shuffle masks and *target* vector
> -/// shuffle masks. The latter have the special property of a '-2' representing
> -/// a zero-ed lane of a vector.
> -static bool canWidenShuffleElements(ArrayRef<int> Mask,
> - SmallVectorImpl<int> &WidenedMask) {
> - for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
> - // If both elements are undef, it's trivial.
> - if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
> - WidenedMask.push_back(SM_SentinelUndef);
> - continue;
> - }
> -
> - // Check for an undef mask and a mask value properly aligned to fit with
> - // a pair of values. If we find such a case, use the non-undef mask's value.
> - if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
> - WidenedMask.push_back(Mask[i + 1] / 2);
> - continue;
> - }
> - if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
> - WidenedMask.push_back(Mask[i] / 2);
> - continue;
> - }
> -
> - // When zeroing, we need to spread the zeroing across both lanes to widen.
> - if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
> - if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
> - (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
> - WidenedMask.push_back(SM_SentinelZero);
> - continue;
> - }
> - return false;
> - }
> -
> - // Finally check if the two mask values are adjacent and aligned with
> - // a pair.
> - if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
> - WidenedMask.push_back(Mask[i] / 2);
> - continue;
> - }
> -
> - // Otherwise we can't safely widen the elements used in this shuffle.
> - return false;
> - }
> - assert(WidenedMask.size() == Mask.size() / 2 &&
> - "Incorrect size of mask after widening the elements!");
> -
> - return true;
> -}
> -
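The widening check deleted just above is easy to model on plain ints. A minimal sketch, using -1 for undef and leaving out the SM_SentinelZero handling, on a made-up mask:

// Standalone model of the pairing rule in canWidenShuffleElements, using -1
// for undef and omitting the zero-sentinel cases.
#include <cstdio>
#include <vector>

static bool widenMask(const std::vector<int> &Mask, std::vector<int> &Out) {
  for (size_t i = 0; i < Mask.size(); i += 2) {
    int Lo = Mask[i], Hi = Mask[i + 1];
    if (Lo < 0 && Hi < 0) { Out.push_back(-1); continue; }          // both undef
    if (Lo < 0 && Hi >= 0 && Hi % 2 == 1) { Out.push_back(Hi / 2); continue; }
    if (Hi < 0 && Lo >= 0 && Lo % 2 == 0) { Out.push_back(Lo / 2); continue; }
    if (Lo >= 0 && Lo % 2 == 0 && Lo + 1 == Hi) { Out.push_back(Lo / 2); continue; }
    return false; // This pair can't be expressed with a wider element.
  }
  return true;
}

int main() {
  std::vector<int> Wide;
  // A v4-style mask {0, 1, 6, 7} widens to the v2-style mask {0, 3}.
  bool OK = widenMask({0, 1, 6, 7}, Wide);
  std::printf("%d: %d %d\n", OK, Wide[0], Wide[1]); // 1: 0 3
  return 0;
}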
> -/// \brief Generic routine to split vector shuffle into half-sized shuffles.
> -///
> -/// This routine just extracts two subvectors, shuffles them independently, and
> -/// then concatenates them back together. This should work effectively with all
> -/// AVX vector shuffle types.
> -static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
> - SDValue V2, ArrayRef<int> Mask,
> - SelectionDAG &DAG) {
> - assert(VT.getSizeInBits() >= 256 &&
> - "Only for 256-bit or wider vector shuffles!");
> - assert(V1.getSimpleValueType() == VT && "Bad operand type!");
> - assert(V2.getSimpleValueType() == VT && "Bad operand type!");
> -
> - ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
> - ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
> -
> - int NumElements = VT.getVectorNumElements();
> - int SplitNumElements = NumElements / 2;
> - MVT ScalarVT = VT.getScalarType();
> - MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
> -
> - // Rather than splitting build-vectors, just build two narrower build
> - // vectors. This helps shuffling with splats and zeros.
> - auto SplitVector = [&](SDValue V) {
> - while (V.getOpcode() == ISD::BITCAST)
> - V = V->getOperand(0);
> -
> - MVT OrigVT = V.getSimpleValueType();
> - int OrigNumElements = OrigVT.getVectorNumElements();
> - int OrigSplitNumElements = OrigNumElements / 2;
> - MVT OrigScalarVT = OrigVT.getScalarType();
> - MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
> -
> - SDValue LoV, HiV;
> -
> - auto *BV = dyn_cast<BuildVectorSDNode>(V);
> - if (!BV) {
> - LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
> - DAG.getIntPtrConstant(0));
> - HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
> - DAG.getIntPtrConstant(OrigSplitNumElements));
> - } else {
> -
> - SmallVector<SDValue, 16> LoOps, HiOps;
> - for (int i = 0; i < OrigSplitNumElements; ++i) {
> - LoOps.push_back(BV->getOperand(i));
> - HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
> - }
> - LoV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, LoOps);
> - HiV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, HiOps);
> - }
> - return std::make_pair(DAG.getNode(ISD::BITCAST, DL, SplitVT, LoV),
> - DAG.getNode(ISD::BITCAST, DL, SplitVT, HiV));
> - };
> -
> - SDValue LoV1, HiV1, LoV2, HiV2;
> - std::tie(LoV1, HiV1) = SplitVector(V1);
> - std::tie(LoV2, HiV2) = SplitVector(V2);
> -
> - // Now create two 4-way blends of these half-width vectors.
> - auto HalfBlend = [&](ArrayRef<int> HalfMask) {
> - bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
> - SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask;
> - for (int i = 0; i < SplitNumElements; ++i) {
> - int M = HalfMask[i];
> - if (M >= NumElements) {
> - if (M >= NumElements + SplitNumElements)
> - UseHiV2 = true;
> - else
> - UseLoV2 = true;
> - V2BlendMask.push_back(M - NumElements);
> - V1BlendMask.push_back(-1);
> - BlendMask.push_back(SplitNumElements + i);
> - } else if (M >= 0) {
> - if (M >= SplitNumElements)
> - UseHiV1 = true;
> - else
> - UseLoV1 = true;
> - V2BlendMask.push_back(-1);
> - V1BlendMask.push_back(M);
> - BlendMask.push_back(i);
> - } else {
> - V2BlendMask.push_back(-1);
> - V1BlendMask.push_back(-1);
> - BlendMask.push_back(-1);
> - }
> - }
> -
> - // Because the lowering happens after all combining takes place, we need to
> - // manually combine these blend masks as much as possible so that we create
> - // a minimal number of high-level vector shuffle nodes.
> -
> - // First try just blending the halves of V1 or V2.
> - if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
> - return DAG.getUNDEF(SplitVT);
> - if (!UseLoV2 && !UseHiV2)
> - return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
> - if (!UseLoV1 && !UseHiV1)
> - return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
> -
> - SDValue V1Blend, V2Blend;
> - if (UseLoV1 && UseHiV1) {
> - V1Blend =
> - DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
> - } else {
> - // We only use half of V1 so map the usage down into the final blend mask.
> - V1Blend = UseLoV1 ? LoV1 : HiV1;
> - for (int i = 0; i < SplitNumElements; ++i)
> - if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
> - BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
> - }
> - if (UseLoV2 && UseHiV2) {
> - V2Blend =
> - DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
> - } else {
> - // We only use half of V2 so map the usage down into the final blend mask.
> - V2Blend = UseLoV2 ? LoV2 : HiV2;
> - for (int i = 0; i < SplitNumElements; ++i)
> - if (BlendMask[i] >= SplitNumElements)
> - BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
> - }
> - return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
> - };
> - SDValue Lo = HalfBlend(LoMask);
> - SDValue Hi = HalfBlend(HiMask);
> - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
> -}
> -
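The mask bookkeeping in that split lowering is the subtle part. Here is a standalone sketch of the per-half rewrite for an 8-element two-input mask (values 0..7 name V1 elements, 8..15 name V2 elements); the mask itself is made up:

// Standalone model of the per-half mask rewrite in splitAndLowerVectorShuffle
// for an 8-element two-input mask (0..7 = V1, 8..15 = V2, -1 = undef).
#include <cstdio>
#include <vector>

int main() {
  const int NumElements = 8, SplitNumElements = 4;
  std::vector<int> HalfMask = {0, 9, 6, 15}; // made-up low half of a v8 mask
  std::vector<int> V1Blend, V2Blend, Blend;
  for (int i = 0; i < SplitNumElements; ++i) {
    int M = HalfMask[i];
    if (M >= NumElements) {      // element comes from V2
      V2Blend.push_back(M - NumElements);
      V1Blend.push_back(-1);
      Blend.push_back(SplitNumElements + i);
    } else if (M >= 0) {         // element comes from V1
      V1Blend.push_back(M);
      V2Blend.push_back(-1);
      Blend.push_back(i);
    } else {                     // undef
      V1Blend.push_back(-1);
      V2Blend.push_back(-1);
      Blend.push_back(-1);
    }
  }
  // The final blend picks lane i from the shuffled V1 half (index i) or from
  // the shuffled V2 half (index SplitNumElements + i).
  for (int M : Blend) std::printf("%d ", M); // 0 5 2 7
  std::printf("\n");
  return 0;
}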
> -/// \brief Either split a vector in halves or decompose the shuffles and the
> -/// blend.
> -///
> -/// This is provided as a good fallback for many lowerings of non-single-input
> -/// shuffles with more than one 128-bit lane. In those cases, we want to select
> -/// between splitting the shuffle into 128-bit components and stitching those
> -/// back together vs. extracting the single-input shuffles and blending those
> -/// results.
> -static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
> - SDValue V2, ArrayRef<int> Mask,
> - SelectionDAG &DAG) {
> - assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to "
> - "lower single-input shuffles as it "
> - "could then recurse on itself.");
> - int Size = Mask.size();
> -
> - // If this can be modeled as a broadcast of two elements followed by a blend,
> - // prefer that lowering. This is especially important because broadcasts can
> - // often fold with memory operands.
> - auto DoBothBroadcast = [&] {
> - int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
> - for (int M : Mask)
> - if (M >= Size) {
> - if (V2BroadcastIdx == -1)
> - V2BroadcastIdx = M - Size;
> - else if (M - Size != V2BroadcastIdx)
> - return false;
> - } else if (M >= 0) {
> - if (V1BroadcastIdx == -1)
> - V1BroadcastIdx = M;
> - else if (M != V1BroadcastIdx)
> - return false;
> - }
> - return true;
> - };
> - if (DoBothBroadcast())
> - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
> - DAG);
> -
> - // If the inputs all stem from a single 128-bit lane of each input, then we
> - // split them rather than blending because the split will decompose to
> - // unusually few instructions.
> - int LaneCount = VT.getSizeInBits() / 128;
> - int LaneSize = Size / LaneCount;
> - SmallBitVector LaneInputs[2];
> - LaneInputs[0].resize(LaneCount, false);
> - LaneInputs[1].resize(LaneCount, false);
> - for (int i = 0; i < Size; ++i)
> - if (Mask[i] >= 0)
> - LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
> - if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
> - return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
> -
> - // Otherwise, just fall back to decomposed shuffles and a blend. This requires
> - // that the decomposed single-input shuffles don't end up here.
> - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
> -}
> -
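The DoBothBroadcast heuristic above boils down to "every V1 reference names one V1 element and every V2 reference names one V2 element". A small sketch on made-up masks:

// Standalone model of the DoBothBroadcast test: true iff all V1 references
// name a single V1 element and all V2 references a single V2 element.
#include <cstdio>
#include <vector>

static bool bothBroadcast(const std::vector<int> &Mask) {
  int Size = (int)Mask.size();
  int V1Idx = -1, V2Idx = -1;
  for (int M : Mask) {
    if (M >= Size) {
      if (V2Idx == -1) V2Idx = M - Size;
      else if (M - Size != V2Idx) return false;
    } else if (M >= 0) {
      if (V1Idx == -1) V1Idx = M;
      else if (M != V1Idx) return false;
    }
  }
  return true;
}

int main() {
  std::printf("%d\n", bothBroadcast({2, 6, 2, 6})); // 1: splat of V1[2] and V2[2]
  std::printf("%d\n", bothBroadcast({0, 1, 4, 5})); // 0: not a pair of splats
  return 0;
}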
> -/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
> -/// a permutation and blend of those lanes.
> -///
> -/// This essentially blends the out-of-lane inputs to each lane into the lane
> -/// from a permuted copy of the vector. This lowering strategy results in four
> -/// instructions in the worst case for a single-input cross lane shuffle which
> -/// is lower than any other fully general cross-lane shuffle strategy I'm aware
> -/// of. Special cases for each particular shuffle pattern should be handled
> -/// prior to trying this lowering.
> -static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
> - SDValue V1, SDValue V2,
> - ArrayRef<int> Mask,
> - SelectionDAG &DAG) {
> - // FIXME: This should probably be generalized for 512-bit vectors as well.
> - assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!");
> - int LaneSize = Mask.size() / 2;
> -
> - // If there are only inputs from one 128-bit lane, splitting will in fact be
> - // less expensive. The flags track whether the given lane contains an element
> - // that crosses to another lane.
> - bool LaneCrossing[2] = {false, false};
> - for (int i = 0, Size = Mask.size(); i < Size; ++i)
> - if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
> - LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
> - if (!LaneCrossing[0] || !LaneCrossing[1])
> - return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
> -
> - if (isSingleInputShuffleMask(Mask)) {
> - SmallVector<int, 32> FlippedBlendMask;
> - for (int i = 0, Size = Mask.size(); i < Size; ++i)
> - FlippedBlendMask.push_back(
> - Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
> - ? Mask[i]
> - : Mask[i] % LaneSize +
> - (i / LaneSize) * LaneSize + Size));
> -
> - // Flip the vector, and blend the results which should now be in-lane. The
> - // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
> - // 5 for the high source. The value 3 selects the high half of source 2 and
> - // the value 2 selects the low half of source 2. We only use source 2 to
> - // allow folding it into a memory operand.
> - unsigned PERMMask = 3 | 2 << 4;
> - SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
> - V1, DAG.getConstant(PERMMask, MVT::i8));
> - return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
> - }
> -
> - // This now reduces to two single-input shuffles of V1 and V2 which at worst
> - // will be handled by the above logic and a blend of the results, much like
> - // other patterns in AVX.
> - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
> -}
> -
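The single-input cross-lane trick above is easier to see with concrete numbers. A sketch for a v4f64-sized mask: the VPERM2X128 immediate 3 | 2 << 4 (0x23) builds V1 with its 128-bit halves swapped, and the flipped blend mask redirects every lane-crossing element into that swapped copy (indices 4..7). The mask value is made up:

// Standalone model of the single-input cross-lane case, v4f64-sized mask.
// "Flipped" is V1 with its 128-bit halves swapped via VPERM2X128 imm 0x23
// (low nibble 3 = high half of source 2, high nibble 2 = low half of source
// 2); lane-crossing elements are redirected into it as indices 4..7.
#include <cstdio>
#include <vector>

int main() {
  const int Size = 4, LaneSize = 2;
  std::vector<int> Mask = {2, 1, 3, 0}; // made-up mask crossing both lanes
  std::vector<int> FlippedBlendMask;
  for (int i = 0; i < Size; ++i)
    FlippedBlendMask.push_back(
        Mask[i] < 0 ? -1
                    : ((Mask[i] % Size) / LaneSize == i / LaneSize)
                          ? Mask[i]
                          : Mask[i] % LaneSize + (i / LaneSize) * LaneSize + Size);
  for (int M : FlippedBlendMask) std::printf("%d ", M); // 4 1 3 6
  std::printf("\n");
  return 0;
}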
> -/// \brief Handle lowering 2-lane 128-bit shuffles.
> -static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
> - SDValue V2, ArrayRef<int> Mask,
> - const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> - // Blends are faster and handle all the non-lane-crossing cases.
> - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
> - Subtarget, DAG))
> - return Blend;
> -
> - MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
> - VT.getVectorNumElements() / 2);
> - // Check for patterns which can be matched with a single insert of a 128-bit
> - // subvector.
> - if (isShuffleEquivalent(V1, V2, Mask, 0, 1, 0, 1) ||
> - isShuffleEquivalent(V1, V2, Mask, 0, 1, 4, 5)) {
> - SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
> - DAG.getIntPtrConstant(0));
> - SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
> - Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0));
> - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
> - }
> - if (isShuffleEquivalent(V1, V2, Mask, 0, 1, 6, 7)) {
> - SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
> - DAG.getIntPtrConstant(0));
> - SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
> - DAG.getIntPtrConstant(2));
> - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
> - }
> -
> - // Otherwise form a 128-bit permutation.
> - // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half.
> - unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4;
> - return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
> - DAG.getConstant(PermMask, MVT::i8));
> -}
> -
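The VPERM2X128 immediate built at the end of that routine is compact enough to show directly: each nibble selects one of the four 128-bit halves of the concatenated (V1, V2) pair (0/1 = V1 low/high, 2/3 = V2 low/high). A tiny sketch with a made-up whole-half mask:

// Standalone model of the VPERM2X128 immediate for whole-half shuffles.
#include <cstdio>

int main() {
  int Mask[4] = {2, 3, 4, 5}; // made-up mask: high half of V1, low half of V2
  unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4;
  std::printf("0x%02x\n", PermMask); // 0x21
  return 0;
}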
> -/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
> -/// shuffling each lane.
> -///
> -/// This will only succeed when the result of fixing the 128-bit lanes results
> -/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
> -/// each 128-bit lane. This handles many cases where we can quickly blend away
> -/// the lane crosses early and then use simpler shuffles within each lane.
> -///
> -/// FIXME: It might be worthwhile at some point to support this without
> -/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
> -/// in x86 only floating point has interesting non-repeating shuffles, and even
> -/// those are still *marginally* more expensive.
> -static SDValue lowerVectorShuffleByMerging128BitLanes(
> - SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
> - const X86Subtarget *Subtarget, SelectionDAG &DAG) {
> - assert(!isSingleInputShuffleMask(Mask) &&
> - "This is only useful with multiple inputs.");
> -
> - int Size = Mask.size();
> - int LaneSize = 128 / VT.getScalarSizeInBits();
> - int NumLanes = Size / LaneSize;
> - assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
> -
> - // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
> - // check whether the in-128-bit lane shuffles share a repeating pattern.
> - SmallVector<int, 4> Lanes;
> - Lanes.resize(NumLanes, -1);
> - SmallVector<int, 4> InLaneMask;
> - InLaneMask.resize(LaneSize, -1);
> - for (int i = 0; i < Size; ++i) {
> - if (Mask[i] < 0)
> - continue;
> + // If there is a free slot in the source half mask adjacent to one of
> + // the inputs, place the other input in it. We use (Index XOR 1) to
> + // compute an adjacent index.
> + if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
> + SourceHalfMask[InputsFixed[0] ^ 1] == -1) {
> + SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
> + SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
> + InputsFixed[1] = InputsFixed[0] ^ 1;
> + } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
> + SourceHalfMask[InputsFixed[1] ^ 1] == -1) {
> + SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
> + SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
> + InputsFixed[0] = InputsFixed[1] ^ 1;
> + } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 &&
> + SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) {
> + // The two inputs are in the same DWord but it is clobbered and the
> + // adjacent DWord isn't used at all. Move both inputs to the free
> + // slot.
> + SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
> + SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
> + InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
> + InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
> + } else {
> + // The only way we hit this point is if there is no clobbering
> + // (because there are no off-half inputs to this half) and there is no
> + // free slot adjacent to one of the inputs. In this case, we have to
> + // swap an input with a non-input.
> + for (int i = 0; i < 4; ++i)
> + assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) &&
> + "We can't handle any clobbers here!");
> + assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
> + "Cannot have adjacent inputs here!");
>
> - int j = i / LaneSize;
> + SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
> + SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
>
> - if (Lanes[j] < 0) {
> - // First entry we've seen for this lane.
> - Lanes[j] = Mask[i] / LaneSize;
> - } else if (Lanes[j] != Mask[i] / LaneSize) {
> - // This doesn't match the lane selected previously!
> - return SDValue();
> - }
> + // We also have to update the final source mask in this case because
> + // it may need to undo the above swap.
> + for (int &M : FinalSourceHalfMask)
> + if (M == (InputsFixed[0] ^ 1) + SourceOffset)
> + M = InputsFixed[1] + SourceOffset;
> + else if (M == InputsFixed[1] + SourceOffset)
> + M = (InputsFixed[0] ^ 1) + SourceOffset;
>
> - // Check that within each lane we have a consistent shuffle mask.
> - int k = i % LaneSize;
> - if (InLaneMask[k] < 0) {
> - InLaneMask[k] = Mask[i] % LaneSize;
> - } else if (InLaneMask[k] != Mask[i] % LaneSize) {
> - // This doesn't fit a repeating in-lane mask.
> - return SDValue();
> - }
> - }
> + InputsFixed[1] = InputsFixed[0] ^ 1;
> + }
>
> - // First shuffle the lanes into place.
> - MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
> - VT.getSizeInBits() / 64);
> - SmallVector<int, 8> LaneMask;
> - LaneMask.resize(NumLanes * 2, -1);
> - for (int i = 0; i < NumLanes; ++i)
> - if (Lanes[i] >= 0) {
> - LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
> - LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
> + // Point everything at the fixed inputs.
> + for (int &M : HalfMask)
> + if (M == IncomingInputs[0])
> + M = InputsFixed[0] + SourceOffset;
> + else if (M == IncomingInputs[1])
> + M = InputsFixed[1] + SourceOffset;
> +
> + IncomingInputs[0] = InputsFixed[0] + SourceOffset;
> + IncomingInputs[1] = InputsFixed[1] + SourceOffset;
> + }
> + } else {
> + llvm_unreachable("Unhandled input size!");
> }
>
> - V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1);
> - V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2);
> - SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
> + // Now hoist the DWord down to the right half.
> + int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
> + assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
> + PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
> + for (int &M : HalfMask)
> + for (int Input : IncomingInputs)
> + if (M == Input)
> + M = FreeDWord * 2 + Input % 2;
> + };
> + moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
> + /*SourceOffset*/ 4, /*DestOffset*/ 0);
> + moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
> + /*SourceOffset*/ 0, /*DestOffset*/ 4);
>
> - // Cast it back to the type we actually want.
> - LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle);
> + // Now enact all the shuffles we've computed to move the inputs into their
> + // target half.
> + if (!isNoopShuffleMask(PSHUFLMask))
> + V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
> + getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG));
> + if (!isNoopShuffleMask(PSHUFHMask))
> + V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
> + getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG));
> + if (!isNoopShuffleMask(PSHUFDMask))
> + V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
> + DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
> + DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
> + getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
>
> - // Now do a simple shuffle that isn't lane crossing.
> - SmallVector<int, 8> NewMask;
> - NewMask.resize(Size, -1);
> - for (int i = 0; i < Size; ++i)
> - if (Mask[i] >= 0)
> - NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
> - assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
> - "Must not introduce lane crosses at this point!");
> + // At this point, each half should contain all its inputs, and we can then
> + // just shuffle them into their final position.
> + assert(std::count_if(LoMask.begin(), LoMask.end(),
> + [](int M) { return M >= 4; }) == 0 &&
> + "Failed to lift all the high half inputs to the low mask!");
> + assert(std::count_if(HiMask.begin(), HiMask.end(),
> + [](int M) { return M >= 0 && M < 4; }) == 0 &&
> + "Failed to lift all the low half inputs to the high mask!");
>
> - return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
> -}
> + // Do a half shuffle for the low mask.
> + if (!isNoopShuffleMask(LoMask))
> + V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
> + getV4X86ShuffleImm8ForMask(LoMask, DAG));
>
> -/// \brief Test whether the specified input (0 or 1) is in-place blended by the
> -/// given mask.
> -///
> -/// This returns true if the elements from a particular input are already in the
> -/// slot required by the given mask and require no permutation.
> -static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
> - assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
> - int Size = Mask.size();
> - for (int i = 0; i < Size; ++i)
> - if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
> - return false;
> + // Do a half shuffle with the high mask after shifting its values down.
> + for (int &M : HiMask)
> + if (M >= 0)
> + M -= 4;
> + if (!isNoopShuffleMask(HiMask))
> + V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
> + getV4X86ShuffleImm8ForMask(HiMask, DAG));
>
> - return true;
> + return V;
> }
>
> -/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
> -///
> -/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
> -/// isn't available.
> -static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> - const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> - SDLoc DL(Op);
> - assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
> - assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
> - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> - ArrayRef<int> Mask = SVOp->getMask();
> - assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
> -
> - SmallVector<int, 4> WidenedMask;
> - if (canWidenShuffleElements(Mask, WidenedMask))
> - return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget,
> - DAG);
> -
> - if (isSingleInputShuffleMask(Mask)) {
> - // Check for being able to broadcast a single element.
> - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
> - Mask, Subtarget, DAG))
> - return Broadcast;
> -
> - // Use low duplicate instructions for masks that match their pattern.
> - if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 2, 2))
> - return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
> +/// \brief Helper to form a PSHUFB-based shuffle+blend.
> +static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1,
> + SDValue V2, ArrayRef<int> Mask,
> + SelectionDAG &DAG, bool &V1InUse,
> + bool &V2InUse) {
> + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
> + SDValue V1Mask[16];
> + SDValue V2Mask[16];
> + V1InUse = false;
> + V2InUse = false;
>
> - if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
> - // Non-half-crossing single input shuffles can be lowered with an
> - // interleaved permutation.
> - unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
> - ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
> - return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
> - DAG.getConstant(VPERMILPMask, MVT::i8));
> + int Size = Mask.size();
> + int Scale = 16 / Size;
> + for (int i = 0; i < 16; ++i) {
> + if (Mask[i / Scale] == -1) {
> + V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
> + } else {
> + const int ZeroMask = 0x80;
> + int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
> + : ZeroMask;
> + int V2Idx = Mask[i / Scale] < Size
> + ? ZeroMask
> + : (Mask[i / Scale] - Size) * Scale + i % Scale;
> + if (Zeroable[i / Scale])
> + V1Idx = V2Idx = ZeroMask;
> + V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8);
> + V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8);
> + V1InUse |= (ZeroMask != V1Idx);
> + V2InUse |= (ZeroMask != V2Idx);
> }
> -
> - // With AVX2 we have direct support for this permutation.
> - if (Subtarget->hasAVX2())
> - return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
> - getV4X86ShuffleImm8ForMask(Mask, DAG));
> -
> - // Otherwise, fall back.
> - return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
> - DAG);
> - }
> -
> - // X86 has dedicated unpack instructions that can handle specific blend
> - // operations: UNPCKH and UNPCKL.
> - if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
> - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
> - if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
> - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
> - if (isShuffleEquivalent(V1, V2, Mask, 4, 0, 6, 2))
> - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V2, V1);
> - if (isShuffleEquivalent(V1, V2, Mask, 5, 1, 7, 3))
> - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1);
> -
> - // If we have a single input to the zero element, insert that into V1 if we
> - // can do so cheaply.
> - int NumV2Elements =
> - std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
> - if (NumV2Elements == 1 && Mask[0] >= 4)
> - if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
> - MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG))
> - return Insertion;
> -
> - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
> - Subtarget, DAG))
> - return Blend;
> -
> - // Check if the blend happens to exactly fit that of SHUFPD.
> - if ((Mask[0] == -1 || Mask[0] < 2) &&
> - (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) &&
> - (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) &&
> - (Mask[3] == -1 || Mask[3] >= 6)) {
> - unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) |
> - ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3);
> - return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2,
> - DAG.getConstant(SHUFPDMask, MVT::i8));
> - }
> - if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) &&
> - (Mask[1] == -1 || Mask[1] < 2) &&
> - (Mask[2] == -1 || Mask[2] >= 6) &&
> - (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4))) {
> - unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) |
> - ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3);
> - return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1,
> - DAG.getConstant(SHUFPDMask, MVT::i8));
> }
>
> - // Try to simplify this by merging 128-bit lanes to enable a lane-based
> - // shuffle. However, if we have AVX2 and either input is already in place,
> - // we will be able to shuffle the other input even across lanes in a single
> - // instruction, so skip this pattern.
> - if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
> - isShuffleMaskInputInPlace(1, Mask))))
> - if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
> - DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
> - return Result;
> + if (V1InUse)
> + V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
> + DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V1),
> + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
> + if (V2InUse)
> + V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
> + DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V2),
> + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
>
> - // If we have AVX2 then we always want to lower with a blend because at v4 we
> - // can fully permute the elements.
> - if (Subtarget->hasAVX2())
> - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
> - Mask, DAG);
> + // If we need shuffled inputs from both, blend the two.
> + SDValue V;
> + if (V1InUse && V2InUse)
> + V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
> + else
> + V = V1InUse ? V1 : V2;
>
> - // Otherwise fall back on generic lowering.
> - return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
> + // Cast the result back to the correct type.
> + return DAG.getNode(ISD::BITCAST, DL, VT, V);
> }
>
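Since the PSHUFB helper is new in this form, a standalone sketch of the control bytes it builds may help. This models a two-input v8i16 mask (Scale = 2 bytes per element), leaves out the Zeroable handling, and uses -1 in place of the undef constant; 0x80 in a PSHUFB control byte zeroes that output byte, which is why the two shuffled inputs can simply be OR'd together:

// Standalone model of the PSHUFB control bytes built by the helper above for
// a two-input v8i16 mask (Scale = 2 bytes per element). Zeroable handling is
// omitted; -1 stands in for the undef byte.
#include <cstdio>

int main() {
  const int Size = 8, Scale = 16 / Size, ZeroMask = 0x80;
  int Mask[8] = {0, 9, 2, 11, 4, 13, 6, 15}; // made-up interleave-style blend
  int V1Ctl[16], V2Ctl[16];
  for (int i = 0; i < 16; ++i) {
    int M = Mask[i / Scale];
    if (M < 0) { V1Ctl[i] = V2Ctl[i] = -1; continue; }
    V1Ctl[i] = M < Size ? M * Scale + i % Scale : ZeroMask;
    V2Ctl[i] = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
  }
  // V1's control: 0 1 128 128 4 5 128 128 8 9 128 128 12 13 128 128
  for (int i = 0; i < 16; ++i) std::printf("%d ", V1Ctl[i]);
  std::printf("\n");
  return 0;
}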
> -/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
> +/// \brief Generic lowering of 8-lane i16 shuffles.
> ///
> -/// This routine is only called when we have AVX2 and thus a reasonable
> -/// instruction set for v4i64 shuffling.
> -static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> +/// This handles both single-input shuffles and combined shuffle/blends with
> +/// two inputs. The single input shuffles are immediately delegated to
> +/// a dedicated lowering routine.
> +///
> +/// The blends are lowered in one of three fundamental ways. If there are few
> +/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
> +/// of the input is significantly cheaper when lowered as an interleaving of
> +/// the two inputs, try to interleave them. Otherwise, blend the low and high
> +/// halves of the inputs separately (making them have relatively few inputs)
> +/// and then concatenate them.
> +static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> const X86Subtarget *Subtarget,
> SelectionDAG &DAG) {
> SDLoc DL(Op);
> - assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
> - assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
> + assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
> + assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
> + assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
> ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> - ArrayRef<int> Mask = SVOp->getMask();
> - assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
> - assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");
> + ArrayRef<int> OrigMask = SVOp->getMask();
> + int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
> + OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
> + MutableArrayRef<int> Mask(MaskStorage);
>
> - SmallVector<int, 4> WidenedMask;
> - if (canWidenShuffleElements(Mask, WidenedMask))
> - return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget,
> - DAG);
> + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
>
> - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
> - Subtarget, DAG))
> - return Blend;
> + // Whenever we can lower this as a zext, that instruction is strictly faster
> + // than any alternative.
> + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
> + DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
> + return ZExt;
>
> - // Check for being able to broadcast a single element.
> - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1,
> - Mask, Subtarget, DAG))
> - return Broadcast;
> + auto isV1 = [](int M) { return M >= 0 && M < 8; };
> + auto isV2 = [](int M) { return M >= 8; };
>
> - // When the shuffle is mirrored between the 128-bit lanes of the unit, we can
> - // use lower latency instructions that will operate on both 128-bit lanes.
> - SmallVector<int, 2> RepeatedMask;
> - if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
> - if (isSingleInputShuffleMask(Mask)) {
> - int PSHUFDMask[] = {-1, -1, -1, -1};
> - for (int i = 0; i < 2; ++i)
> - if (RepeatedMask[i] >= 0) {
> - PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
> - PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
> - }
> - return DAG.getNode(
> - ISD::BITCAST, DL, MVT::v4i64,
> - DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
> - DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
> - getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
> - }
> - }
> + int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1);
> + int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
>
> - // AVX2 provides a direct instruction for permuting a single input across
> - // lanes.
> - if (isSingleInputShuffleMask(Mask))
> - return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
> - getV4X86ShuffleImm8ForMask(Mask, DAG));
> + if (NumV2Inputs == 0)
> + return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG);
> +
> + assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
> + "to be V1-input shuffles.");
>
> // Try to use shift instructions.
> if (SDValue Shift =
> - lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, DAG))
> + lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG))
> return Shift;
>
> - // Use dedicated unpack instructions for masks that match their pattern.
> - if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
> - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
> - if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
> - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
> - if (isShuffleEquivalent(V1, V2, Mask, 4, 0, 6, 2))
> - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V2, V1);
> - if (isShuffleEquivalent(V1, V2, Mask, 5, 1, 7, 3))
> - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V2, V1);
> -
> - // Try to simplify this by merging 128-bit lanes to enable a lane-based
> - // shuffle. However, if we have AVX2 and either input is already in place,
> - // we will be able to shuffle the other input even across lanes in a single
> - // instruction, so skip this pattern.
> - if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
> - isShuffleMaskInputInPlace(1, Mask))))
> - if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
> - DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
> - return Result;
> + // There are special ways we can lower some single-element blends.
> + if (NumV2Inputs == 1)
> + if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
> + Mask, Subtarget, DAG))
> + return V;
>
> - // Otherwise fall back on generic blend lowering.
> - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
> - Mask, DAG);
> -}
> + // We have different paths for blend lowering, but they all must use the
> + // *exact* same predicate.
> + bool IsBlendSupported = Subtarget->hasSSE41();
> + if (IsBlendSupported)
> + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
> + Subtarget, DAG))
> + return Blend;
>
> -/// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
> -///
> -/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
> -/// isn't available.
> -static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> - const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> - SDLoc DL(Op);
> - assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
> - assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
> - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> - ArrayRef<int> Mask = SVOp->getMask();
> - assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
> + if (SDValue Masked =
> + lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
> + return Masked;
>
> - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
> - Subtarget, DAG))
> - return Blend;
> + // Use dedicated unpack instructions for masks that match their pattern.
> + if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 1, 9, 2, 10, 3, 11))
> + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
> + if (isShuffleEquivalent(V1, V2, Mask, 4, 12, 5, 13, 6, 14, 7, 15))
> + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
>
> - // Check for being able to broadcast a single element.
> - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
> - Mask, Subtarget, DAG))
> - return Broadcast;
> + // Try to use byte rotation instructions.
> + if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
> + DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
> + return Rotate;
>
> - // If the shuffle mask is repeated in each 128-bit lane, we have many more
> - // options to efficiently lower the shuffle.
> - SmallVector<int, 4> RepeatedMask;
> - if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
> - assert(RepeatedMask.size() == 4 &&
> - "Repeated masks must be half the mask width!");
> + if (SDValue BitBlend =
> + lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
> + return BitBlend;
>
> - // Use even/odd duplicate instructions for masks that match their pattern.
> - if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 2, 2, 4, 4, 6, 6))
> - return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
> - if (isShuffleEquivalent(V1, V2, Mask, 1, 1, 3, 3, 5, 5, 7, 7))
> - return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
> + if (SDValue Unpack =
> + lowerVectorShuffleAsUnpack(MVT::v8i16, DL, V1, V2, Mask, DAG))
> + return Unpack;
>
> - if (isSingleInputShuffleMask(Mask))
> - return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
> - getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
> + // If we can't directly blend but can use PSHUFB, that will be better as it
> + // can both shuffle and set up the inefficient blend.
> + if (!IsBlendSupported && Subtarget->hasSSSE3()) {
> + bool V1InUse, V2InUse;
> + return lowerVectorShuffleAsPSHUFB(DL, MVT::v8i16, V1, V2, Mask, DAG,
> + V1InUse, V2InUse);
> + }
>
> - // Use dedicated unpack instructions for masks that match their pattern.
> - if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 1, 9, 4, 12, 5, 13))
> - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
> - if (isShuffleEquivalent(V1, V2, Mask, 2, 10, 3, 11, 6, 14, 7, 15))
> - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
> - if (isShuffleEquivalent(V1, V2, Mask, 8, 0, 9, 1, 12, 4, 13, 5))
> - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V2, V1);
> - if (isShuffleEquivalent(V1, V2, Mask, 10, 2, 11, 3, 14, 6, 15, 7))
> - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V2, V1);
> + // We can always bit-blend if we have to, so the fallback strategy is to
> + // decompose into single-input permutes and blends.
> + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
> + Mask, DAG);
> +}
>
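As a side note on the dedicated-unpack checks in the v8i16 path above: the mask {0, 8, 1, 9, 2, 10, 3, 11} is exactly the element order PUNPCKLWD produces, which is why isShuffleEquivalent can map it to a single instruction. A quick sketch of that correspondence:

// Standalone check that the mask {0, 8, 1, 9, 2, 10, 3, 11} matched above is
// the element order PUNPCKLWD produces (interleave the low words of V1, V2).
#include <cstdio>

int main() {
  int V1[8] = {0, 1, 2, 3, 4, 5, 6, 7};       // V1 element i is "i"
  int V2[8] = {8, 9, 10, 11, 12, 13, 14, 15}; // V2 element i is "8 + i"
  int R[8];
  for (int i = 0; i < 4; ++i) { // punpcklwd semantics on the low four words
    R[2 * i] = V1[i];
    R[2 * i + 1] = V2[i];
  }
  for (int i = 0; i < 8; ++i) std::printf("%d ", R[i]); // 0 8 1 9 2 10 3 11
  std::printf("\n");
  return 0;
}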
> - // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
> - // have already handled any direct blends. We also need to squash the
> - // repeated mask into a simulated v4f32 mask.
> - for (int i = 0; i < 4; ++i)
> - if (RepeatedMask[i] >= 8)
> - RepeatedMask[i] -= 4;
> - return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
> - }
> +/// \brief Check whether a compaction lowering can be done by dropping even
> +/// elements and compute how many times even elements must be dropped.
> +///
> +/// This handles shuffles which take every Nth element where N is a power of
> +/// two. Example shuffle masks:
> +///
> +/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
> +/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
> +/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
> +/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
> +/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
> +/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
> +///
> +/// Any of these lanes can of course be undef.
> +///
> +/// This routine only supports N <= 3.
> +/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
> +/// for larger N.
> +///
> +/// \returns N above, or the number of times even elements must be dropped if
> +/// there is such a number. Otherwise returns zero.
> +static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
> + // Figure out whether we're looping over two inputs or just one.
> + bool IsSingleInput = isSingleInputShuffleMask(Mask);
>
> - // If we have a single input shuffle with different shuffle patterns in the
> - // two 128-bit lanes use the variable mask to VPERMILPS.
> - if (isSingleInputShuffleMask(Mask)) {
> - SDValue VPermMask[8];
> - for (int i = 0; i < 8; ++i)
> - VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
> - : DAG.getConstant(Mask[i], MVT::i32);
> - if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
> - return DAG.getNode(
> - X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
> - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
> + // The modulus for the shuffle vector entries is based on whether this is
> + // a single input or not.
> + int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
> + assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
> + "We should only be called with masks with a power-of-2 size!");
>
> - if (Subtarget->hasAVX2())
> - return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
> - DAG.getNode(ISD::BITCAST, DL, MVT::v8f32,
> - DAG.getNode(ISD::BUILD_VECTOR, DL,
> - MVT::v8i32, VPermMask)),
> - V1);
> + uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
>
> - // Otherwise, fall back.
> - return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
> - DAG);
> - }
> + // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
> + // and 2^3 simultaneously. This is because we may have ambiguity with
> + // partially undef inputs.
> + bool ViableForN[3] = {true, true, true};
>
> - // Try to simplify this by merging 128-bit lanes to enable a lane-based
> - // shuffle.
> - if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
> - DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
> - return Result;
> + for (int i = 0, e = Mask.size(); i < e; ++i) {
> + // Ignore undef lanes, we'll optimistically collapse them to the pattern we
> + // want.
> + if (Mask[i] == -1)
> + continue;
>
> - // If we have AVX2 then we always want to lower with a blend because at v8 we
> - // can fully permute the elements.
> - if (Subtarget->hasAVX2())
> - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
> - Mask, DAG);
> + bool IsAnyViable = false;
> + for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
> + if (ViableForN[j]) {
> + uint64_t N = j + 1;
>
> - // Otherwise fall back on generic lowering.
> - return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
> + // The shuffle mask must be equal to (i * 2^N) % M.
> + if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
> + IsAnyViable = true;
> + else
> + ViableForN[j] = false;
> + }
> + // Early exit if we exhaust the possible powers of two.
> + if (!IsAnyViable)
> + break;
> + }
> +
> + for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
> + if (ViableForN[j])
> + return j + 1;
> +
> + // Return 0 as there is no viable power of two.
> + return 0;
> }
>
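The even-element-dropping test reads more naturally when checked against one N at a time. Here is a standalone sketch of the same predicate, Mask[i] == (i << N) mod Modulus with Modulus the mask size (doubled for two inputs), skipping undef lanes; the simultaneous tracking of all three N values in the real code only exists to cope with ambiguity from undefs:

// Standalone model (not the LLVM helper) of the even-element-dropping test:
// find the smallest N in {1,2,3} such that Mask[i] == (i << N) mod Modulus.
// Undef lanes (-1) are skipped, matching the real check.
#include <cstdio>
#include <vector>

static int droppingFactor(const std::vector<int> &Mask, bool SingleInput) {
  int Modulus = (int)Mask.size() * (SingleInput ? 1 : 2);
  for (int N = 1; N <= 3; ++N) {
    bool Viable = true;
    for (int i = 0, e = (int)Mask.size(); i < e; ++i)
      if (Mask[i] >= 0 && Mask[i] != ((i << N) & (Modulus - 1)))
        Viable = false;
    if (Viable) return N;
  }
  return 0;
}

int main() {
  std::vector<int> Mask = {0, 2, 4, 6, 8, 10, 12, 14,
                           16, 18, 20, 22, 24, 26, 28, 30};
  std::printf("%d\n", droppingFactor(Mask, /*SingleInput=*/false)); // 1
  return 0;
}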
> -/// \brief Handle lowering of 8-lane 32-bit integer shuffles.
> +/// \brief Generic lowering of v16i8 shuffles.
> ///
> -/// This routine is only called when we have AVX2 and thus a reasonable
> -/// instruction set for v8i32 shuffling.
> -static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> +/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
> +/// detect any complexity reducing interleaving. If that doesn't help, it uses
> +/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
> +/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
> +/// back together.
> +static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> const X86Subtarget *Subtarget,
> SelectionDAG &DAG) {
> SDLoc DL(Op);
> - assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
> - assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
> + assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
> + assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
> + assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
> ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> ArrayRef<int> Mask = SVOp->getMask();
> - assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
> - assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
> -
> - // Whenever we can lower this as a zext, that instruction is strictly faster
> - // than any alternative. It also allows us to fold memory operands into the
> - // shuffle in many cases.
> - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2,
> - Mask, Subtarget, DAG))
> - return ZExt;
> -
> - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
> - Subtarget, DAG))
> - return Blend;
> -
> - // Check for being able to broadcast a single element.
> - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1,
> - Mask, Subtarget, DAG))
> - return Broadcast;
> -
> - // If the shuffle mask is repeated in each 128-bit lane we can use more
> - // efficient instructions that mirror the shuffles across the two 128-bit
> - // lanes.
> - SmallVector<int, 4> RepeatedMask;
> - if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
> - assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
> - if (isSingleInputShuffleMask(Mask))
> - return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
> - getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
> -
> - // Use dedicated unpack instructions for masks that match their pattern.
> - if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 1, 9, 4, 12, 5, 13))
> - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
> - if (isShuffleEquivalent(V1, V2, Mask, 2, 10, 3, 11, 6, 14, 7, 15))
> - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
> - if (isShuffleEquivalent(V1, V2, Mask, 8, 0, 9, 1, 12, 4, 13, 5))
> - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V2, V1);
> - if (isShuffleEquivalent(V1, V2, Mask, 10, 2, 11, 3, 14, 6, 15, 7))
> - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V2, V1);
> - }
> + assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
>
> // Try to use shift instructions.
> if (SDValue Shift =
> - lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, DAG))
> + lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, DAG))
> return Shift;
>
> + // Try to use byte rotation instructions.
> if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
> - DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
> + DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
> return Rotate;
>
> - // If the shuffle patterns aren't repeated but it is a single input, directly
> - // generate a cross-lane VPERMD instruction.
> - if (isSingleInputShuffleMask(Mask)) {
> - SDValue VPermMask[8];
> - for (int i = 0; i < 8; ++i)
> - VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
> - : DAG.getConstant(Mask[i], MVT::i32);
> - return DAG.getNode(
> - X86ISD::VPERMV, DL, MVT::v8i32,
> - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
> - }
> + // Try to use a zext lowering.
> + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
> + DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
> + return ZExt;
>
> - // Try to simplify this by merging 128-bit lanes to enable a lane-based
> - // shuffle.
> - if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
> - DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
> - return Result;
> + int NumV2Elements =
> + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
>
> - // Otherwise fall back on generic blend lowering.
> - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
> - Mask, DAG);
> -}
> + // For single-input shuffles, there are some nicer lowering tricks we can use.
> + if (NumV2Elements == 0) {
> + // Check for being able to broadcast a single element.
> + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
> + Mask, Subtarget, DAG))
> + return Broadcast;
>
> -/// \brief Handle lowering of 16-lane 16-bit integer shuffles.
> -///
> -/// This routine is only called when we have AVX2 and thus a reasonable
> -/// instruction set for v16i16 shuffling.
> -static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> - const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> - SDLoc DL(Op);
> - assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
> - assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
> - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> - ArrayRef<int> Mask = SVOp->getMask();
> - assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
> - assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
> + // Check whether we can widen this to an i16 shuffle by duplicating bytes.
> + // Notably, this handles splat and partial-splat shuffles more efficiently.
> + // However, it only makes sense if the pre-duplication shuffle simplifies
> + // things significantly. Currently, this means we need to be able to
> + // express the pre-duplication shuffle as an i16 shuffle.
> + //
> + // FIXME: We should check for other patterns which can be widened into an
> + // i16 shuffle as well.
> + auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
> + for (int i = 0; i < 16; i += 2)
> + if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1])
> + return false;
>
> - // Whenever we can lower this as a zext, that instruction is strictly faster
> - // than any alternative. It also allows us to fold memory operands into the
> - // shuffle in many cases.
> - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2,
> - Mask, Subtarget, DAG))
> - return ZExt;
> + return true;
> + };
> + auto tryToWidenViaDuplication = [&]() -> SDValue {
> + if (!canWidenViaDuplication(Mask))
> + return SDValue();
> + SmallVector<int, 4> LoInputs;
> + std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
> + [](int M) { return M >= 0 && M < 8; });
> + std::sort(LoInputs.begin(), LoInputs.end());
> + LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
> + LoInputs.end());
> + SmallVector<int, 4> HiInputs;
> + std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
> + [](int M) { return M >= 8; });
> + std::sort(HiInputs.begin(), HiInputs.end());
> + HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
> + HiInputs.end());
>
> - // Check for being able to broadcast a single element.
> - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
> - Mask, Subtarget, DAG))
> - return Broadcast;
> + bool TargetLo = LoInputs.size() >= HiInputs.size();
> + ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
> + ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
>
> - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
> - Subtarget, DAG))
> - return Blend;
> + int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
> + SmallDenseMap<int, int, 8> LaneMap;
> + for (int I : InPlaceInputs) {
> + PreDupI16Shuffle[I/2] = I/2;
> + LaneMap[I] = I;
> + }
> + int j = TargetLo ? 0 : 4, je = j + 4;
> + for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
> + // Check if j is already a shuffle of this input. This happens when
> + // there are two adjacent bytes after we move the low one.
> + if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
> + // If we haven't yet mapped the input, search for a slot into which
> + // we can map it.
> + while (j < je && PreDupI16Shuffle[j] != -1)
> + ++j;
> +
> + if (j == je)
> + // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
> + return SDValue();
> +
> + // Map this input with the i16 shuffle.
> + PreDupI16Shuffle[j] = MovingInputs[i] / 2;
> + }
> +
> + // Update the lane map based on the mapping we ended up with.
> + LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
> + }
> + V1 = DAG.getNode(
> + ISD::BITCAST, DL, MVT::v16i8,
> + DAG.getVectorShuffle(MVT::v8i16, DL,
> + DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
> + DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
> +
> + // Unpack the bytes to form the i16s that will be shuffled into place.
> + V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
> + MVT::v16i8, V1, V1);
> +
> + int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
> + for (int i = 0; i < 16; ++i)
> + if (Mask[i] != -1) {
> + int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
> + assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
> + if (PostDupI16Shuffle[i / 2] == -1)
> + PostDupI16Shuffle[i / 2] = MappedMask;
> + else
> + assert(PostDupI16Shuffle[i / 2] == MappedMask &&
> + "Conflicting entries in the original shuffle!");
> + }
> + return DAG.getNode(
> + ISD::BITCAST, DL, MVT::v16i8,
> + DAG.getVectorShuffle(MVT::v8i16, DL,
> + DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
> + DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
> + };
> + if (SDValue V = tryToWidenViaDuplication())
> + return V;
> + }
>
> // Use dedicated unpack instructions for masks that match their pattern.
> if (isShuffleEquivalent(V1, V2, Mask,
> - // First 128-bit lane:
> - 0, 16, 1, 17, 2, 18, 3, 19,
> - // Second 128-bit lane:
> - 8, 24, 9, 25, 10, 26, 11, 27))
> - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
> + 0, 16, 1, 17, 2, 18, 3, 19,
> + 4, 20, 5, 21, 6, 22, 7, 23))
> + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V1, V2);
> if (isShuffleEquivalent(V1, V2, Mask,
> - // First 128-bit lane:
> - 4, 20, 5, 21, 6, 22, 7, 23,
> - // Second 128-bit lane:
> + 8, 24, 9, 25, 10, 26, 11, 27,
> 12, 28, 13, 29, 14, 30, 15, 31))
> - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
> -
> - // Try to use shift instructions.
> - if (SDValue Shift =
> - lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, DAG))
> - return Shift;
> + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V1, V2);
>
> - // Try to use byte rotation instructions.
> - if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
> - DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
> - return Rotate;
> + // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
> + // with PSHUFB. It is important to do this before we attempt to generate any
> + // blends but after all of the single-input lowerings. If the single input
> + // lowerings can find an instruction sequence that is faster than a PSHUFB, we
> + // want to preserve that and we can DAG combine any longer sequences into
> + // a PSHUFB in the end. But once we start blending from multiple inputs,
> + // the complexity of DAG combining bad patterns back into PSHUFB is too high,
> + // and there are *very* few patterns that would actually be faster than the
> + // PSHUFB approach because of its ability to zero lanes.
> + //
> + // FIXME: The only exceptions to the above are blends which are exact
> + // interleavings with direct instructions supporting them. We currently don't
> + // handle those well here.
> + if (Subtarget->hasSSSE3()) {
> + bool V1InUse = false;
> + bool V2InUse = false;
>
> - if (isSingleInputShuffleMask(Mask)) {
> - // There are no generalized cross-lane shuffle operations available on i16
> - // element types.
> - if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
> - return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
> - Mask, DAG);
> + SDValue PSHUFB = lowerVectorShuffleAsPSHUFB(DL, MVT::v16i8, V1, V2, Mask,
> + DAG, V1InUse, V2InUse);
>
> - SDValue PSHUFBMask[32];
> - for (int i = 0; i < 16; ++i) {
> - if (Mask[i] == -1) {
> - PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
> - continue;
> - }
> + // If both V1 and V2 are in use and we can use a direct blend or an unpack,
> + // do so. This avoids using them to handle blends-with-zero which is
> + // important as a single pshufb is significantly faster for that.
> + if (V1InUse && V2InUse) {
> + if (Subtarget->hasSSE41())
> + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2,
> + Mask, Subtarget, DAG))
> + return Blend;
>
> - int M = i < 8 ? Mask[i] : Mask[i] - 8;
> - assert(M >= 0 && M < 8 && "Invalid single-input mask!");
> - PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8);
> - PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8);
> + // We can use an unpack to do the blending rather than an or in some
> + // cases. Even though the OR may be (marginally) more efficient, we
> + // prefer this lowering because there are common cases where part of
> + // the complexity of the shuffles goes away when we do the final blend as
> + // an unpack.
> + // FIXME: It might be worth trying to detect if the unpack-feeding
> + // shuffles will both be pshufb, in which case we shouldn't bother with
> + // this.
> + if (SDValue Unpack =
> + lowerVectorShuffleAsUnpack(MVT::v16i8, DL, V1, V2, Mask, DAG))
> + return Unpack;
> }
> - return DAG.getNode(
> - ISD::BITCAST, DL, MVT::v16i16,
> - DAG.getNode(
> - X86ISD::PSHUFB, DL, MVT::v32i8,
> - DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1),
> - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
> +
> + return PSHUFB;
> }
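
As an aside for readers following along: the "ability to zero lanes" the comment above mentions is the high bit of each PSHUFB control byte. A rough standalone emulation of the 128-bit byte select, purely illustrative and not code from this patch (the helper name is my own):

    #include <array>
    #include <cstdint>

    // Emulates one 128-bit lane of (V)PSHUFB: each control byte selects a
    // source byte with its low four bits, or forces the output byte to
    // zero when its high bit is set.
    std::array<uint8_t, 16> pshufb128(const std::array<uint8_t, 16> &Src,
                                      const std::array<uint8_t, 16> &Ctl) {
      std::array<uint8_t, 16> Out{};
      for (int i = 0; i < 16; ++i)
        Out[i] = (Ctl[i] & 0x80) ? 0 : Src[Ctl[i] & 0x0F];
      return Out;
    }

That zeroing path is why a blend-with-zero is cheaper as a single pshufb than as a blend of two separate shuffles.
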
>
> - // Try to simplify this by merging 128-bit lanes to enable a lane-based
> - // shuffle.
> - if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
> - DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
> - return Result;
> + // There are special ways we can lower some single-element blends.
> + if (NumV2Elements == 1)
> + if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2,
> + Mask, Subtarget, DAG))
> + return V;
>
> - // Otherwise fall back on generic lowering.
> - return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
> -}
> + if (SDValue BitBlend =
> + lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
> + return BitBlend;
>
> -/// \brief Handle lowering of 32-lane 8-bit integer shuffles.
> -///
> -/// This routine is only called when we have AVX2 and thus a reasonable
> -/// instruction set for v32i8 shuffling..
> -static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> - const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> - SDLoc DL(Op);
> - assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
> - assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
> - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> - ArrayRef<int> Mask = SVOp->getMask();
> - assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
> - assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
> + // Check whether a compaction lowering can be done. This handles shuffles
> + // which take every Nth element for some even N. See the helper function for
> + // details.
> + //
> + // We special case these as they can be particularly efficiently handled with
> + // the PACKUSWB instruction on x86 and they show up in common patterns of
> + // rearranging bytes to truncate wide elements.
> + if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) {
> + // NumEvenDrops is the power of two stride of the elements. Another way of
> + // thinking about it is that we need to drop the even elements this many
> + // times to get the original input.
> + bool IsSingleInput = isSingleInputShuffleMask(Mask);
>
> - // Whenever we can lower this as a zext, that instruction is strictly faster
> - // than any alternative. It also allows us to fold memory operands into the
> - // shuffle in many cases.
> - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2,
> - Mask, Subtarget, DAG))
> - return ZExt;
> + // First we need to zero all the dropped bytes.
> + assert(NumEvenDrops <= 3 &&
> + "No support for dropping even elements more than 3 times.");
> + // We use the mask type to pick which bytes are preserved based on how many
> + // elements are dropped.
> + MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
> + SDValue ByteClearMask =
> + DAG.getNode(ISD::BITCAST, DL, MVT::v16i8,
> + DAG.getConstant(0xFF, MaskVTs[NumEvenDrops - 1]));
> + V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
> + if (!IsSingleInput)
> + V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
>
> - // Check for being able to broadcast a single element.
> - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
> - Mask, Subtarget, DAG))
> - return Broadcast;
> + // Now pack things back together.
> + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
> + V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
> + SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
> + for (int i = 1; i < NumEvenDrops; ++i) {
> + Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result);
> + Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
> + }
>
> - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
> - Subtarget, DAG))
> - return Blend;
> + return Result;
> + }
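
The compaction case is easiest to see on a concrete mask: for v16i8, <0, 2, 4, ..., 30> keeps every other byte of the two concatenated inputs, which is exactly what masking with 0x00FF and one PACKUSWB produces. A deliberately simplified, stride-2-only sketch of the kind of test canLowerByDroppingEvenElements performs (my own illustration; the real helper also handles strides 4 and 8 by packing repeatedly):

    #include <vector>

    // Returns true if the mask just keeps every other element of the two
    // concatenated inputs -- the "drop the odd elements" pattern that one
    // PACKUSWB after byte-clearing can implement. Undef (-1) entries are
    // allowed anywhere.
    static bool keepsEveryOtherElement(const std::vector<int> &Mask) {
      for (int i = 0, e = (int)Mask.size(); i != e; ++i)
        if (Mask[i] >= 0 && Mask[i] != 2 * i)
          return false;
      return true;
    }
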
>
> - // Use dedicated unpack instructions for masks that match their pattern.
> - // Note that these are repeated 128-bit lane unpacks, not unpacks across all
> - // 256-bit lanes.
> - if (isShuffleEquivalent(
> - V1, V2, Mask,
> - // First 128-bit lane:
> - 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
> - // Second 128-bit lane:
> - 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55))
> - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
> - if (isShuffleEquivalent(
> - V1, V2, Mask,
> - // First 128-bit lane:
> - 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
> - // Second 128-bit lane:
> - 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63))
> - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
> + // Handle multi-input cases by blending single-input shuffles.
> + if (NumV2Elements > 0)
> + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
> + Mask, DAG);
>
> - // Try to use shift instructions.
> - if (SDValue Shift =
> - lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, DAG))
> - return Shift;
> + // The fallback path for single-input shuffles widens this into two v8i16
> + // vectors with unpacks, shuffles those, and then pulls them back together
> + // with a pack.
> + SDValue V = V1;
>
> - // Try to use byte rotation instructions.
> - if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
> - DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
> - return Rotate;
> + int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
> + int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
> + for (int i = 0; i < 16; ++i)
> + if (Mask[i] >= 0)
> + (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
>
> - if (isSingleInputShuffleMask(Mask)) {
> - // There are no generalized cross-lane shuffle operations available on i8
> - // element types.
> - if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
> - return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
> - Mask, DAG);
> + SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
>
> - SDValue PSHUFBMask[32];
> - for (int i = 0; i < 32; ++i)
> - PSHUFBMask[i] =
> - Mask[i] < 0
> - ? DAG.getUNDEF(MVT::i8)
> - : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, MVT::i8);
> + SDValue VLoHalf, VHiHalf;
> + // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
> + // them out and avoid using UNPCK{L,H} to extract the elements of V as
> + // i16s.
> + if (std::none_of(std::begin(LoBlendMask), std::end(LoBlendMask),
> + [](int M) { return M >= 0 && M % 2 == 1; }) &&
> + std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask),
> + [](int M) { return M >= 0 && M % 2 == 1; })) {
> + // Use a mask to drop the high bytes.
> + VLoHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
> + VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
> + DAG.getConstant(0x00FF, MVT::v8i16));
>
> - return DAG.getNode(
> - X86ISD::PSHUFB, DL, MVT::v32i8, V1,
> - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
> - }
> + // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
> + VHiHalf = DAG.getUNDEF(MVT::v8i16);
>
> - // Try to simplify this by merging 128-bit lanes to enable a lane-based
> - // shuffle.
> - if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
> - DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
> - return Result;
> + // Squash the masks to point directly into VLoHalf.
> + for (int &M : LoBlendMask)
> + if (M >= 0)
> + M /= 2;
> + for (int &M : HiBlendMask)
> + if (M >= 0)
> + M /= 2;
> + } else {
> + // Otherwise just unpack the low half of V into VLoHalf and the high half into
> + // VHiHalf so that we can blend them as i16s.
> + VLoHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
> + DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
> + VHiHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
> + DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
> + }
>
> - // Otherwise fall back on generic lowering.
> - return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
> + SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
> + SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
> +
> + return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
> }
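
One detail of the fallback path worth spelling out: on little-endian x86, interleaving a byte vector with zero (the UNPCKL/UNPCKH against Zero above) is just zero extension of its bytes to words. A tiny conceptual model, not code from the patch:

    #include <array>
    #include <cstdint>

    // Zero-extends sixteen bytes to sixteen i16 lanes; the two unpacks in
    // the code above produce the low and high eight of these words
    // respectively.
    std::array<uint16_t, 16> zeroExtendBytes(const std::array<uint8_t, 16> &V) {
      std::array<uint16_t, 16> Out{};
      for (int i = 0; i < 16; ++i)
        Out[i] = V[i];  // low byte from V, high byte zero
      return Out;
    }

The two half shuffles then operate on those words, and the final PACKUS drops the (still zero) high bytes to get back to v16i8.
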
>
> -/// \brief High-level routine to lower various 256-bit x86 vector shuffles.
> +/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
> ///
> -/// This routine either breaks down the specific type of a 256-bit x86 vector
> -/// shuffle or splits it into two 128-bit shuffles and fuses the results back
> -/// together based on the available instructions.
> -static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> +/// This routine breaks down the specific type of 128-bit shuffle and
> +/// dispatches to the lowering routines accordingly.
> +static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> MVT VT, const X86Subtarget *Subtarget,
> SelectionDAG &DAG) {
> - SDLoc DL(Op);
> - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> - ArrayRef<int> Mask = SVOp->getMask();
> -
> - // There is a really nice hard cut-over between AVX1 and AVX2 that means we can
> - // check for those subtargets here and avoid much of the subtarget querying in
> - // the per-vector-type lowering routines. With AVX1 we have essentially *zero*
> - // ability to manipulate a 256-bit vector with integer types. Since we'll use
> - // floating point types there eventually, just immediately cast everything to
> - // a float and operate entirely in that domain.
> - if (VT.isInteger() && !Subtarget->hasAVX2()) {
> - int ElementBits = VT.getScalarSizeInBits();
> - if (ElementBits < 32)
> - // No floating point type available, decompose into 128-bit vectors.
> - return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
> -
> - MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
> - VT.getVectorNumElements());
> - V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1);
> - V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2);
> - return DAG.getNode(ISD::BITCAST, DL, VT,
> - DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
> - }
> -
> switch (VT.SimpleTy) {
> - case MVT::v4f64:
> - return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
> - case MVT::v4i64:
> - return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
> - case MVT::v8f32:
> - return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
> - case MVT::v8i32:
> - return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
> - case MVT::v16i16:
> - return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
> - case MVT::v32i8:
> - return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
> + case MVT::v2i64:
> + return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
> + case MVT::v2f64:
> + return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
> + case MVT::v4i32:
> + return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
> + case MVT::v4f32:
> + return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
> + case MVT::v8i16:
> + return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
> + case MVT::v16i8:
> + return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
>
> default:
> - llvm_unreachable("Not a valid 256-bit x86 vector type!");
> + llvm_unreachable("Unimplemented!");
> }
> }
>
> -/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
> -static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> - const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> - SDLoc DL(Op);
> - assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
> - assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
> - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> - ArrayRef<int> Mask = SVOp->getMask();
> - assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
> +/// \brief Helper function to test whether a shuffle mask could be
> +/// simplified by widening the elements being shuffled.
> +///
> +/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
> +/// leaves it in an unspecified state.
> +///
> +/// NOTE: This must handle normal vector shuffle masks and *target* vector
> +/// shuffle masks. The latter have the special property of a '-2' representing
> +/// a zeroed lane of a vector.
> +static bool canWidenShuffleElements(ArrayRef<int> Mask,
> + SmallVectorImpl<int> &WidenedMask) {
> + for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
> + // If both elements are undef, it's trivial.
> + if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
> + WidenedMask.push_back(SM_SentinelUndef);
> + continue;
> + }
>
> - // X86 has dedicated unpack instructions that can handle specific blend
> - // operations: UNPCKH and UNPCKL.
> - if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 2, 10, 4, 12, 6, 14))
> - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
> - if (isShuffleEquivalent(V1, V2, Mask, 1, 9, 3, 11, 5, 13, 7, 15))
> - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
> + // Check for an undef mask and a mask value properly aligned to fit with
> + // a pair of values. If we find such a case, use the non-undef mask's value.
> + if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
> + WidenedMask.push_back(Mask[i + 1] / 2);
> + continue;
> + }
> + if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
> + WidenedMask.push_back(Mask[i] / 2);
> + continue;
> + }
>
> - // FIXME: Implement direct support for this type!
> - return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
> -}
> + // When zeroing, we need to spread the zeroing across both lanes to widen.
> + if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
> + if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
> + (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
> + WidenedMask.push_back(SM_SentinelZero);
> + continue;
> + }
> + return false;
> + }
>
> -/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
> -static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> - const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> - SDLoc DL(Op);
> - assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
> - assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
> - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> - ArrayRef<int> Mask = SVOp->getMask();
> - assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
> + // Finally check if the two mask values are adjacent and aligned with
> + // a pair.
> + if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
> + WidenedMask.push_back(Mask[i] / 2);
> + continue;
> + }
>
> - // Use dedicated unpack instructions for masks that match their pattern.
> - if (isShuffleEquivalent(V1, V2, Mask,
> - 0, 16, 1, 17, 4, 20, 5, 21,
> - 8, 24, 9, 25, 12, 28, 13, 29))
> - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
> - if (isShuffleEquivalent(V1, V2, Mask,
> - 2, 18, 3, 19, 6, 22, 7, 23,
> - 10, 26, 11, 27, 14, 30, 15, 31))
> - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
> + // Otherwise we can't safely widen the elements used in this shuffle.
> + return false;
> + }
> + assert(WidenedMask.size() == Mask.size() / 2 &&
> + "Incorrect size of mask after widening the elements!");
>
> - // FIXME: Implement direct support for this type!
> - return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
> + return true;
> }
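
The widening test is easy to picture on a concrete mask: over 8 narrow elements, <0, 1, 6, 7, -1, -1, 10, 11> widens to <0, 3, -1, 5> over 4 elements of twice the width. A stripped-down sketch of the same pairing rules, ignoring the SM_SentinelZero handling the real helper needs for target shuffle masks:

    #include <vector>

    // Widens a mask over 2N elements into one over N elements twice as
    // wide. Handles only undef (-1) and ordinary indices; this is an
    // illustration, not the LLVM helper.
    static bool widenMask(const std::vector<int> &Mask, std::vector<int> &Wide) {
      for (size_t i = 0; i + 1 < Mask.size(); i += 2) {
        int Lo = Mask[i], Hi = Mask[i + 1];
        if (Lo < 0 && Hi < 0)
          Wide.push_back(-1);                      // both halves undef
        else if (Lo >= 0 && Lo % 2 == 0 && (Hi < 0 || Hi == Lo + 1))
          Wide.push_back(Lo / 2);                  // aligned pair (or low half only)
        else if (Lo < 0 && Hi % 2 == 1)
          Wide.push_back(Hi / 2);                  // only the high half is defined
        else
          return false;                            // this pair can't be widened
      }
      return true;
    }
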
>
> -/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
> -static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> - const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> - SDLoc DL(Op);
> - assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
> - assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
> - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> - ArrayRef<int> Mask = SVOp->getMask();
> - assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
> +/// \brief Generic routine to split vector shuffle into half-sized shuffles.
> +///
> +/// This routine just extracts two subvectors, shuffles them independently, and
> +/// then concatenates them back together. This should work effectively with all
> +/// AVX vector shuffle types.
> +static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
> + SDValue V2, ArrayRef<int> Mask,
> + SelectionDAG &DAG) {
> + assert(VT.getSizeInBits() >= 256 &&
> + "Only for 256-bit or wider vector shuffles!");
> + assert(V1.getSimpleValueType() == VT && "Bad operand type!");
> + assert(V2.getSimpleValueType() == VT && "Bad operand type!");
>
> - // X86 has dedicated unpack instructions that can handle specific blend
> - // operations: UNPCKH and UNPCKL.
> - if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 2, 10, 4, 12, 6, 14))
> - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
> - if (isShuffleEquivalent(V1, V2, Mask, 1, 9, 3, 11, 5, 13, 7, 15))
> - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
> + ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
> + ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
>
> - // FIXME: Implement direct support for this type!
> - return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
> -}
> + int NumElements = VT.getVectorNumElements();
> + int SplitNumElements = NumElements / 2;
> + MVT ScalarVT = VT.getScalarType();
> + MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
> +
> + // Rather than splitting build-vectors, just build two narrower build
> + // vectors. This helps shuffling with splats and zeros.
> + auto SplitVector = [&](SDValue V) {
> + while (V.getOpcode() == ISD::BITCAST)
> + V = V->getOperand(0);
> +
> + MVT OrigVT = V.getSimpleValueType();
> + int OrigNumElements = OrigVT.getVectorNumElements();
> + int OrigSplitNumElements = OrigNumElements / 2;
> + MVT OrigScalarVT = OrigVT.getScalarType();
> + MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
> +
> + SDValue LoV, HiV;
> +
> + auto *BV = dyn_cast<BuildVectorSDNode>(V);
> + if (!BV) {
> + LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
> + DAG.getIntPtrConstant(0));
> + HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
> + DAG.getIntPtrConstant(OrigSplitNumElements));
> + } else {
> +
> + SmallVector<SDValue, 16> LoOps, HiOps;
> + for (int i = 0; i < OrigSplitNumElements; ++i) {
> + LoOps.push_back(BV->getOperand(i));
> + HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
> + }
> + LoV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, LoOps);
> + HiV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, HiOps);
> + }
> + return std::make_pair(DAG.getNode(ISD::BITCAST, DL, SplitVT, LoV),
> + DAG.getNode(ISD::BITCAST, DL, SplitVT, HiV));
> + };
> +
> + SDValue LoV1, HiV1, LoV2, HiV2;
> + std::tie(LoV1, HiV1) = SplitVector(V1);
> + std::tie(LoV2, HiV2) = SplitVector(V2);
> +
> + // Now create two 4-way blends of these half-width vectors.
> + auto HalfBlend = [&](ArrayRef<int> HalfMask) {
> + bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
> + SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask;
> + for (int i = 0; i < SplitNumElements; ++i) {
> + int M = HalfMask[i];
> + if (M >= NumElements) {
> + if (M >= NumElements + SplitNumElements)
> + UseHiV2 = true;
> + else
> + UseLoV2 = true;
> + V2BlendMask.push_back(M - NumElements);
> + V1BlendMask.push_back(-1);
> + BlendMask.push_back(SplitNumElements + i);
> + } else if (M >= 0) {
> + if (M >= SplitNumElements)
> + UseHiV1 = true;
> + else
> + UseLoV1 = true;
> + V2BlendMask.push_back(-1);
> + V1BlendMask.push_back(M);
> + BlendMask.push_back(i);
> + } else {
> + V2BlendMask.push_back(-1);
> + V1BlendMask.push_back(-1);
> + BlendMask.push_back(-1);
> + }
> + }
>
> -/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
> -static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> - const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> - SDLoc DL(Op);
> - assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
> - assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
> - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> - ArrayRef<int> Mask = SVOp->getMask();
> - assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
> + // Because the lowering happens after all combining takes place, we need to
> + // manually combine these blend masks as much as possible so that we create
> + // a minimal number of high-level vector shuffle nodes.
>
> - // Use dedicated unpack instructions for masks that match their pattern.
> - if (isShuffleEquivalent(V1, V2, Mask,
> - 0, 16, 1, 17, 4, 20, 5, 21,
> - 8, 24, 9, 25, 12, 28, 13, 29))
> - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
> - if (isShuffleEquivalent(V1, V2, Mask,
> - 2, 18, 3, 19, 6, 22, 7, 23,
> - 10, 26, 11, 27, 14, 30, 15, 31))
> - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
> + // First try just blending the halves of V1 or V2.
> + if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
> + return DAG.getUNDEF(SplitVT);
> + if (!UseLoV2 && !UseHiV2)
> + return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
> + if (!UseLoV1 && !UseHiV1)
> + return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
>
> - // FIXME: Implement direct support for this type!
> - return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
> + SDValue V1Blend, V2Blend;
> + if (UseLoV1 && UseHiV1) {
> + V1Blend =
> + DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
> + } else {
> + // We only use half of V1 so map the usage down into the final blend mask.
> + V1Blend = UseLoV1 ? LoV1 : HiV1;
> + for (int i = 0; i < SplitNumElements; ++i)
> + if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
> + BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
> + }
> + if (UseLoV2 && UseHiV2) {
> + V2Blend =
> + DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
> + } else {
> + // We only use half of V2 so map the usage down into the final blend mask.
> + V2Blend = UseLoV2 ? LoV2 : HiV2;
> + for (int i = 0; i < SplitNumElements; ++i)
> + if (BlendMask[i] >= SplitNumElements)
> + BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
> + }
> + return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
> + };
> + SDValue Lo = HalfBlend(LoMask);
> + SDValue Hi = HalfBlend(HiMask);
> + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
> }
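
The interesting bookkeeping in the split path is classifying, for each half of the output, which of the four half-width inputs it actually reads, since that decides whether a cheap single-source shuffle suffices or a real blend is needed. A small sketch of just that classification (hypothetical helper, mirroring the UseLoV1/UseHiV1/UseLoV2/UseHiV2 flags above):

    #include <array>
    #include <vector>

    // For one half of a split shuffle mask, report which of the four
    // half-width inputs (low/high halves of V1 and V2) it reads from.
    // NumElts is the element count of a full-width vector.
    std::array<bool, 4> halvesUsed(const std::vector<int> &HalfMask, int NumElts) {
      std::array<bool, 4> Used{};  // {LoV1, HiV1, LoV2, HiV2}
      int Half = NumElts / 2;
      for (int M : HalfMask) {
        if (M < 0)
          continue;
        bool FromV2 = M >= NumElts;
        int Elt = FromV2 ? M - NumElts : M;
        Used[2 * FromV2 + (Elt >= Half)] = true;
      }
      return Used;
    }
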
>
> -/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
> -static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> - const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> - SDLoc DL(Op);
> - assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
> - assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
> - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> - ArrayRef<int> Mask = SVOp->getMask();
> - assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
> - assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
> +/// \brief Either split a vector in halves or decompose the shuffles and the
> +/// blend.
> +///
> +/// This is provided as a good fallback for many lowerings of non-single-input
> +/// shuffles with more than one 128-bit lane. In those cases, we want to select
> +/// between splitting the shuffle into 128-bit components and stitching those
> +/// back together vs. extracting the single-input shuffles and blending those
> +/// results.
> +static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
> + SDValue V2, ArrayRef<int> Mask,
> + SelectionDAG &DAG) {
> + assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to "
> + "lower single-input shuffles as it "
> + "could then recurse on itself.");
> + int Size = Mask.size();
>
> - // FIXME: Implement direct support for this type!
> - return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG);
> -}
> + // If this can be modeled as a broadcast of two elements followed by a blend,
> + // prefer that lowering. This is especially important because broadcasts can
> + // often fold with memory operands.
> + auto DoBothBroadcast = [&] {
> + int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
> + for (int M : Mask)
> + if (M >= Size) {
> + if (V2BroadcastIdx == -1)
> + V2BroadcastIdx = M - Size;
> + else if (M - Size != V2BroadcastIdx)
> + return false;
> + } else if (M >= 0) {
> + if (V1BroadcastIdx == -1)
> + V1BroadcastIdx = M;
> + else if (M != V1BroadcastIdx)
> + return false;
> + }
> + return true;
> + };
> + if (DoBothBroadcast())
> + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
> + DAG);
>
> -/// \brief Handle lowering of 64-lane 8-bit integer shuffles.
> -static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> - const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> - SDLoc DL(Op);
> - assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
> - assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
> - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> - ArrayRef<int> Mask = SVOp->getMask();
> - assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
> - assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
> + // If the inputs all stem from a single 128-bit lane of each input, then we
> + // split them rather than blending because the split will decompose to
> + // unusually few instructions.
> + int LaneCount = VT.getSizeInBits() / 128;
> + int LaneSize = Size / LaneCount;
> + SmallBitVector LaneInputs[2];
> + LaneInputs[0].resize(LaneCount, false);
> + LaneInputs[1].resize(LaneCount, false);
> + for (int i = 0; i < Size; ++i)
> + if (Mask[i] >= 0)
> + LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
> + if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
> + return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
>
> - // FIXME: Implement direct support for this type!
> - return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
> + // Otherwise, just fall back to decomposed shuffles and a blend. This requires
> + // that the decomposed single-input shuffles don't end up here.
> + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
> }
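
The DoBothBroadcast check above is doing something very simple once extracted: every defined mask entry must point at one fixed element of V1 or one fixed element of V2. Standalone, and only as an illustration:

    #include <vector>

    // True if the shuffle is a blend of two broadcasts: all references into
    // V1 hit one element and all references into V2 hit one element.
    // Size is the number of elements in a single input.
    static bool bothInputsBroadcast(const std::vector<int> &Mask, int Size) {
      int V1Idx = -1, V2Idx = -1;
      for (int M : Mask) {
        if (M < 0)
          continue;
        int &Idx = (M < Size) ? V1Idx : V2Idx;
        int Elt = (M < Size) ? M : M - Size;
        if (Idx >= 0 && Idx != Elt)
          return false;
        Idx = Elt;
      }
      return true;
    }
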
>
> -/// \brief High-level routine to lower various 512-bit x86 vector shuffles.
> +/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
> +/// a permutation and blend of those lanes.
> ///
> -/// This routine either breaks down the specific type of a 512-bit x86 vector
> -/// shuffle or splits it into two 256-bit shuffles and fuses the results back
> -/// together based on the available instructions.
> -static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> - MVT VT, const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> - SDLoc DL(Op);
> - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> - ArrayRef<int> Mask = SVOp->getMask();
> - assert(Subtarget->hasAVX512() &&
> - "Cannot lower 512-bit vectors w/ basic ISA!");
> +/// This essentially blends the out-of-lane inputs to each lane into the lane
> +/// from a permuted copy of the vector. This lowering strategy results in four
> +/// instructions in the worst case for a single-input cross lane shuffle which
> +/// is lower than any other fully general cross-lane shuffle strategy I'm aware
> +/// of. Special cases for each particular shuffle pattern should be handled
> +/// prior to trying this lowering.
> +static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
> + SDValue V1, SDValue V2,
> + ArrayRef<int> Mask,
> + SelectionDAG &DAG) {
> + // FIXME: This should probably be generalized for 512-bit vectors as well.
> + assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!");
> + int LaneSize = Mask.size() / 2;
>
> - // Check for being able to broadcast a single element.
> - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1,
> - Mask, Subtarget, DAG))
> - return Broadcast;
> + // If there are only inputs from one 128-bit lane, splitting will in fact be
> + // less expensive. The flags track whether the given lane contains an element
> + // that crosses to another lane.
> + bool LaneCrossing[2] = {false, false};
> + for (int i = 0, Size = Mask.size(); i < Size; ++i)
> + if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
> + LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
> + if (!LaneCrossing[0] || !LaneCrossing[1])
> + return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
>
> - // Dispatch to each element type for lowering. If we don't have supprot for
> - // specific element type shuffles at 512 bits, immediately split them and
> - // lower them. Each lowering routine of a given type is allowed to assume that
> - // the requisite ISA extensions for that element type are available.
> - switch (VT.SimpleTy) {
> - case MVT::v8f64:
> - return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
> - case MVT::v16f32:
> - return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
> - case MVT::v8i64:
> - return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
> - case MVT::v16i32:
> - return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
> - case MVT::v32i16:
> - if (Subtarget->hasBWI())
> - return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
> - break;
> - case MVT::v64i8:
> - if (Subtarget->hasBWI())
> - return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
> - break;
> + if (isSingleInputShuffleMask(Mask)) {
> + SmallVector<int, 32> FlippedBlendMask;
> + for (int i = 0, Size = Mask.size(); i < Size; ++i)
> + FlippedBlendMask.push_back(
> + Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
> + ? Mask[i]
> + : Mask[i] % LaneSize +
> + (i / LaneSize) * LaneSize + Size));
>
> - default:
> - llvm_unreachable("Not a valid 512-bit x86 vector type!");
> + // Flip the vector, and blend the results which should now be in-lane. The
> + // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
> + // 5 for the high source. The value 3 selects the high half of source 2 and
> + // the value 2 selects the low half of source 2. We only use source 2 to
> + // allow folding it into a memory operand.
> + unsigned PERMMask = 3 | 2 << 4;
> + SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
> + V1, DAG.getConstant(PERMMask, MVT::i8));
> + return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
> }
>
> - // Otherwise fall back on splitting.
> - return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
> + // This now reduces to two single-input shuffles of V1 and V2 which at worst
> + // will be handled by the above logic and a blend of the results, much like
> + // other patterns in AVX.
> + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
> +}
> +
> +/// \brief Handle lowering 2-lane 128-bit shuffles.
> +static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
> + SDValue V2, ArrayRef<int> Mask,
> + const X86Subtarget *Subtarget,
> + SelectionDAG &DAG) {
> + // Blends are faster and handle all the non-lane-crossing cases.
> + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
> + Subtarget, DAG))
> + return Blend;
> +
> + MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
> + VT.getVectorNumElements() / 2);
> + // Check for patterns which can be matched with a single insert of a 128-bit
> + // subvector.
> + if (isShuffleEquivalent(V1, V2, Mask, 0, 1, 0, 1) ||
> + isShuffleEquivalent(V1, V2, Mask, 0, 1, 4, 5)) {
> + SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
> + DAG.getIntPtrConstant(0));
> + SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
> + Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0));
> + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
> + }
> + if (isShuffleEquivalent(V1, V2, Mask, 0, 1, 6, 7)) {
> + SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
> + DAG.getIntPtrConstant(0));
> + SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
> + DAG.getIntPtrConstant(2));
> + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
> + }
> +
> + // Otherwise form a 128-bit permutation.
> + // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half.
> + unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4;
> + return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
> + DAG.getConstant(PermMask, MVT::i8));
> }
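
For the two VPERM2X128 immediates built in this patch (the lane flip 3 | 2 << 4 above, and Mask[0] / 2 | (Mask[2] / 2) << 4 here), it may help to see the immediate's meaning written out. My own emulation of the documented instruction behaviour, not code from the patch:

    #include <array>
    #include <cstdint>

    // Emulates VPERM2F128/VPERM2I128 half selection: each nibble of the
    // immediate picks one 128-bit half of the result, with values 0/1
    // taking the low/high half of the first source, 2/3 taking the
    // low/high half of the second source, and bit 3 zeroing that half.
    using Half = std::array<uint64_t, 2>;  // one 128-bit half as two u64s

    std::array<Half, 2> vperm2x128(const std::array<Half, 2> &Src1,
                                   const std::array<Half, 2> &Src2,
                                   uint8_t Imm) {
      const Half Halves[4] = {Src1[0], Src1[1], Src2[0], Src2[1]};
      std::array<Half, 2> Out{};
      for (int i = 0; i < 2; ++i) {
        uint8_t Nib = (Imm >> (4 * i)) & 0xF;
        Out[i] = (Nib & 0x8) ? Half{0, 0} : Halves[Nib & 0x3];
      }
      return Out;
    }

With the flip immediate 0x23 and V1 passed as the second source, the result is V1 with its halves swapped, which is exactly what the lane-permute-and-blend path wants.
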
>
> -/// \brief Top-level lowering for x86 vector shuffles.
> +/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
> +/// shuffling each lane.
> ///
> -/// This handles decomposition, canonicalization, and lowering of all x86
> -/// vector shuffles. Most of the specific lowering strategies are encapsulated
> -/// above in helper routines. The canonicalization attempts to widen shuffles
> -/// to involve fewer lanes of wider elements, consolidate symmetric patterns
> -/// s.t. only one of the two inputs needs to be tested, etc.
> -static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> - ArrayRef<int> Mask = SVOp->getMask();
> - SDValue V1 = Op.getOperand(0);
> - SDValue V2 = Op.getOperand(1);
> - MVT VT = Op.getSimpleValueType();
> - int NumElements = VT.getVectorNumElements();
> - SDLoc dl(Op);
> -
> - assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
> +/// This will only succeed when the result of fixing the 128-bit lanes results
> +/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
> +/// each 128-bit lanes. This handles many cases where we can quickly blend away
> +/// the lane crosses early and then use simpler shuffles within each lane.
> +///
> +/// FIXME: It might be worthwhile at some point to support this without
> +/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
> +/// in x86 only floating point has interesting non-repeating shuffles, and even
> +/// those are still *marginally* more expensive.
> +static SDValue lowerVectorShuffleByMerging128BitLanes(
> + SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
> + const X86Subtarget *Subtarget, SelectionDAG &DAG) {
> + assert(!isSingleInputShuffleMask(Mask) &&
> + "This is only useful with multiple inputs.");
>
> - bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
> - bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
> - if (V1IsUndef && V2IsUndef)
> - return DAG.getUNDEF(VT);
> + int Size = Mask.size();
> + int LaneSize = 128 / VT.getScalarSizeInBits();
> + int NumLanes = Size / LaneSize;
> + assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
>
> - // When we create a shuffle node we put the UNDEF node to second operand,
> - // but in some cases the first operand may be transformed to UNDEF.
> - // In this case we should just commute the node.
> - if (V1IsUndef)
> - return DAG.getCommutedVectorShuffle(*SVOp);
> + // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
> + // check whether the in-128-bit lane shuffles share a repeating pattern.
> + SmallVector<int, 4> Lanes;
> + Lanes.resize(NumLanes, -1);
> + SmallVector<int, 4> InLaneMask;
> + InLaneMask.resize(LaneSize, -1);
> + for (int i = 0; i < Size; ++i) {
> + if (Mask[i] < 0)
> + continue;
>
> - // Check for non-undef masks pointing at an undef vector and make the masks
> - // undef as well. This makes it easier to match the shuffle based solely on
> - // the mask.
> - if (V2IsUndef)
> - for (int M : Mask)
> - if (M >= NumElements) {
> - SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
> - for (int &M : NewMask)
> - if (M >= NumElements)
> - M = -1;
> - return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
> - }
> + int j = i / LaneSize;
>
> - // We actually see shuffles that are entirely re-arrangements of a set of
> - // zero inputs. This mostly happens while decomposing complex shuffles into
> - // simple ones. Directly lower these as a buildvector of zeros.
> - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
> - if (Zeroable.all())
> - return getZeroVector(VT, Subtarget, DAG, dl);
> + if (Lanes[j] < 0) {
> + // First entry we've seen for this lane.
> + Lanes[j] = Mask[i] / LaneSize;
> + } else if (Lanes[j] != Mask[i] / LaneSize) {
> + // This doesn't match the lane selected previously!
> + return SDValue();
> + }
>
> - // Try to collapse shuffles into using a vector type with fewer elements but
> - // wider element types. We cap this to not form integers or floating point
> - // elements wider than 64 bits, but it might be interesting to form i128
> - // integers to handle flipping the low and high halves of AVX 256-bit vectors.
> - SmallVector<int, 16> WidenedMask;
> - if (VT.getScalarSizeInBits() < 64 &&
> - canWidenShuffleElements(Mask, WidenedMask)) {
> - MVT NewEltVT = VT.isFloatingPoint()
> - ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
> - : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
> - MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
> - // Make sure that the new vector type is legal. For example, v2f64 isn't
> - // legal on SSE1.
> - if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
> - V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
> - V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
> - return DAG.getNode(ISD::BITCAST, dl, VT,
> - DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
> + // Check that within each lane we have a consistent shuffle mask.
> + int k = i % LaneSize;
> + if (InLaneMask[k] < 0) {
> + InLaneMask[k] = Mask[i] % LaneSize;
> + } else if (InLaneMask[k] != Mask[i] % LaneSize) {
> + // This doesn't fit a repeating in-lane mask.
> + return SDValue();
> }
> }
>
> - int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
> - for (int M : SVOp->getMask())
> - if (M < 0)
> - ++NumUndefElements;
> - else if (M < NumElements)
> - ++NumV1Elements;
> - else
> - ++NumV2Elements;
> + // First shuffle the lanes into place.
> + MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
> + VT.getSizeInBits() / 64);
> + SmallVector<int, 8> LaneMask;
> + LaneMask.resize(NumLanes * 2, -1);
> + for (int i = 0; i < NumLanes; ++i)
> + if (Lanes[i] >= 0) {
> + LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
> + LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
> + }
>
> - // Commute the shuffle as needed such that more elements come from V1 than
> - // V2. This allows us to match the shuffle pattern strictly on how many
> - // elements come from V1 without handling the symmetric cases.
> - if (NumV2Elements > NumV1Elements)
> - return DAG.getCommutedVectorShuffle(*SVOp);
> + V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1);
> + V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2);
> + SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
>
> - // When the number of V1 and V2 elements are the same, try to minimize the
> - // number of uses of V2 in the low half of the vector. When that is tied,
> - // ensure that the sum of indices for V1 is equal to or lower than the sum
> - // indices for V2. When those are equal, try to ensure that the number of odd
> - // indices for V1 is lower than the number of odd indices for V2.
> - if (NumV1Elements == NumV2Elements) {
> - int LowV1Elements = 0, LowV2Elements = 0;
> - for (int M : SVOp->getMask().slice(0, NumElements / 2))
> - if (M >= NumElements)
> - ++LowV2Elements;
> - else if (M >= 0)
> - ++LowV1Elements;
> - if (LowV2Elements > LowV1Elements) {
> - return DAG.getCommutedVectorShuffle(*SVOp);
> - } else if (LowV2Elements == LowV1Elements) {
> - int SumV1Indices = 0, SumV2Indices = 0;
> - for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
> - if (SVOp->getMask()[i] >= NumElements)
> - SumV2Indices += i;
> - else if (SVOp->getMask()[i] >= 0)
> - SumV1Indices += i;
> - if (SumV2Indices < SumV1Indices) {
> - return DAG.getCommutedVectorShuffle(*SVOp);
> - } else if (SumV2Indices == SumV1Indices) {
> - int NumV1OddIndices = 0, NumV2OddIndices = 0;
> - for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
> - if (SVOp->getMask()[i] >= NumElements)
> - NumV2OddIndices += i % 2;
> - else if (SVOp->getMask()[i] >= 0)
> - NumV1OddIndices += i % 2;
> - if (NumV2OddIndices < NumV1OddIndices)
> - return DAG.getCommutedVectorShuffle(*SVOp);
> - }
> - }
> - }
> + // Cast it back to the type we actually want.
> + LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle);
>
> - // For each vector width, delegate to a specialized lowering routine.
> - if (VT.getSizeInBits() == 128)
> - return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
> + // Now do a simple shuffle that isn't lane crossing.
> + SmallVector<int, 8> NewMask;
> + NewMask.resize(Size, -1);
> + for (int i = 0; i < Size; ++i)
> + if (Mask[i] >= 0)
> + NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
> + assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
> + "Must not introduce lane crosses at this point!");
>
> - if (VT.getSizeInBits() == 256)
> - return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
> + return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
> +}
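
Restated outside the DAG plumbing, the merge-128-bit-lanes transform factors the mask into a whole-lane shuffle plus one in-lane pattern repeated in every lane, and gives up if either factor doesn't exist. A simplified sketch of just that factorization (illustration only, not the routine above):

    #include <vector>

    // Try to factor Mask into (a) which source lane feeds each destination
    // lane and (b) a single in-lane mask repeated across all lanes.
    // LaneSize is the number of elements per 128-bit lane.
    static bool factorIntoLaneShuffle(const std::vector<int> &Mask, int LaneSize,
                                      std::vector<int> &Lanes,
                                      std::vector<int> &InLane) {
      int Size = (int)Mask.size();
      Lanes.assign(Size / LaneSize, -1);
      InLane.assign(LaneSize, -1);
      for (int i = 0; i < Size; ++i) {
        if (Mask[i] < 0)
          continue;
        int DstLane = i / LaneSize, SrcLane = Mask[i] / LaneSize;
        if (Lanes[DstLane] >= 0 && Lanes[DstLane] != SrcLane)
          return false;                  // lane fed from two different lanes
        Lanes[DstLane] = SrcLane;
        int K = i % LaneSize, M = Mask[i] % LaneSize;
        if (InLane[K] >= 0 && InLane[K] != M)
          return false;                  // in-lane pattern doesn't repeat
        InLane[K] = M;
      }
      return true;
    }
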
>
> - // Force AVX-512 vectors to be scalarized for now.
> - // FIXME: Implement AVX-512 support!
> - if (VT.getSizeInBits() == 512)
> - return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
> +/// \brief Test whether the specified input (0 or 1) is in-place blended by the
> +/// given mask.
> +///
> +/// This returns true if the elements from a particular input are already in the
> +/// slots required by the given mask and require no permutation.
> +static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
> + assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
> + int Size = Mask.size();
> + for (int i = 0; i < Size; ++i)
> + if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
> + return false;
>
> - llvm_unreachable("Unimplemented!");
> + return true;
> }
>
> +/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
> +///
> +/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
> +/// isn't available.
> +static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> + const X86Subtarget *Subtarget,
> + SelectionDAG &DAG) {
> + SDLoc DL(Op);
> + assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
> + assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
> + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> + ArrayRef<int> Mask = SVOp->getMask();
> + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
>
> -//===----------------------------------------------------------------------===//
> -// Legacy vector shuffle lowering
> -//
> -// This code is the legacy code handling vector shuffles until the above
> -// replaces its functionality and performance.
> -//===----------------------------------------------------------------------===//
> + SmallVector<int, 4> WidenedMask;
> + if (canWidenShuffleElements(Mask, WidenedMask))
> + return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget,
> + DAG);
>
> -static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
> - bool hasInt256, unsigned *MaskOut = nullptr) {
> - MVT EltVT = VT.getVectorElementType();
> + if (isSingleInputShuffleMask(Mask)) {
> + // Check for being able to broadcast a single element.
> + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
> + Mask, Subtarget, DAG))
> + return Broadcast;
>
> - // There is no blend with immediate in AVX-512.
> - if (VT.is512BitVector())
> - return false;
> + // Use low duplicate instructions for masks that match their pattern.
> + if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 2, 2))
> + return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
> +
> + if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
> + // Non-half-crossing single input shuffles can be lowered with an
> + // interleaved permutation.
> + unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
> + ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
> + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
> + DAG.getConstant(VPERMILPMask, MVT::i8));
> + }
> +
> + // With AVX2 we have direct support for this permutation.
> + if (Subtarget->hasAVX2())
> + return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
> + getV4X86ShuffleImm8ForMask(Mask, DAG));
> +
> + // Otherwise, fall back.
> + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
> + DAG);
> + }
>
> - if (!hasSSE41 || EltVT == MVT::i8)
> - return false;
> - if (!hasInt256 && VT == MVT::v16i16)
> - return false;
> + // X86 has dedicated unpack instructions that can handle specific blend
> + // operations: UNPCKH and UNPCKL.
> + if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
> + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
> + if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
> + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
> + if (isShuffleEquivalent(V1, V2, Mask, 4, 0, 6, 2))
> + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V2, V1);
> + if (isShuffleEquivalent(V1, V2, Mask, 5, 1, 7, 3))
> + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1);
>
> - unsigned MaskValue = 0;
> - unsigned NumElems = VT.getVectorNumElements();
> - // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
> - unsigned NumLanes = (NumElems - 1) / 8 + 1;
> - unsigned NumElemsInLane = NumElems / NumLanes;
> + // If we have a single input to the zero element, insert that into V1 if we
> + // can do so cheaply.
> + int NumV2Elements =
> + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
> + if (NumV2Elements == 1 && Mask[0] >= 4)
> + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
> + MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG))
> + return Insertion;
>
> - // Blend for v16i16 should be symmetric for both lanes.
> - for (unsigned i = 0; i < NumElemsInLane; ++i) {
> + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
> + Subtarget, DAG))
> + return Blend;
>
> - int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
> - int EltIdx = MaskVals[i];
> + // Check if the blend happens to exactly fit that of SHUFPD.
> + if ((Mask[0] == -1 || Mask[0] < 2) &&
> + (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) &&
> + (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) &&
> + (Mask[3] == -1 || Mask[3] >= 6)) {
> + unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) |
> + ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3);
> + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2,
> + DAG.getConstant(SHUFPDMask, MVT::i8));
> + }
> + if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) &&
> + (Mask[1] == -1 || Mask[1] < 2) &&
> + (Mask[2] == -1 || Mask[2] >= 6) &&
> + (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4))) {
> + unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) |
> + ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3);
> + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1,
> + DAG.getConstant(SHUFPDMask, MVT::i8));
> + }
>
> - if ((EltIdx < 0 || EltIdx == (int)i) &&
> - (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
> - continue;
> + // Try to simplify this by merging 128-bit lanes to enable a lane-based
> + // shuffle. However, if we have AVX2 and either input is already in place, we
> + // will be able to shuffle the other input even across lanes in a single
> + // instruction, so skip this pattern.
> + if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
> + isShuffleMaskInputInPlace(1, Mask))))
> + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
> + DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
> + return Result;
>
> - if (((unsigned)EltIdx == (i + NumElems)) &&
> - (SndLaneEltIdx < 0 ||
> - (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
> - MaskValue |= (1 << i);
> - else
> - return false;
> - }
> + // If we have AVX2 then we always want to lower with a blend because with v4 we
> + // can fully permute the elements.
> + if (Subtarget->hasAVX2())
> + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
> + Mask, DAG);
>
> - if (MaskOut)
> - *MaskOut = MaskValue;
> - return true;
> + // Otherwise fall back on generic lowering.
> + return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
> }
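
The two SHUFPD checks above become clearer with the instruction's selection rule in front of you: within each 128-bit lane the result takes one element from the first source and one from the second, one immediate bit per result element. A scalar model of 256-bit VSHUFPD, written from the documented semantics rather than taken from this patch:

    #include <array>

    // Models 256-bit VSHUFPD element selection for a 4 x f64 vector.
    std::array<double, 4> vshufpd256(const std::array<double, 4> &A,
                                     const std::array<double, 4> &B,
                                     unsigned Imm) {
      return {A[(Imm >> 0) & 1],         // low lane, element taken from A
              B[(Imm >> 1) & 1],         // low lane, element taken from B
              A[2 + ((Imm >> 2) & 1)],   // high lane, element taken from A
              B[2 + ((Imm >> 3) & 1)]};  // high lane, element taken from B
    }

That is why the mask pattern being tested is V1-low, V2-low, V1-high, V2-high (or the commuted form handled just below it).
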
>
> -// Try to lower a shuffle node into a simple blend instruction.
> -// This function assumes isBlendMask returns true for this
> -// SuffleVectorSDNode
> -static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
> - unsigned MaskValue,
> - const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> - MVT VT = SVOp->getSimpleValueType(0);
> - MVT EltVT = VT.getVectorElementType();
> - assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
> - Subtarget->hasInt256() && "Trying to lower a "
> - "VECTOR_SHUFFLE to a Blend but "
> - "with the wrong mask"));
> - SDValue V1 = SVOp->getOperand(0);
> - SDValue V2 = SVOp->getOperand(1);
> - SDLoc dl(SVOp);
> - unsigned NumElems = VT.getVectorNumElements();
> -
> - // Convert i32 vectors to floating point if it is not AVX2.
> - // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
> - MVT BlendVT = VT;
> - if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
> - BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
> - NumElems);
> - V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1);
> - V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2);
> - }
> -
> - SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
> - DAG.getConstant(MaskValue, MVT::i32));
> - return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
> -}
> -
> -/// In vector type \p VT, return true if the element at index \p InputIdx
> -/// falls on a different 128-bit lane than \p OutputIdx.
> -static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
> - unsigned OutputIdx) {
> - unsigned EltSize = VT.getVectorElementType().getSizeInBits();
> - return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128;
> -}
> -
> -/// Generate a PSHUFB if possible. Selects elements from \p V1 according to
> -/// \p MaskVals. MaskVals[OutputIdx] = InputIdx specifies that we want to
> -/// shuffle the element at InputIdx in V1 to OutputIdx in the result. If \p
> -/// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a
> -/// zero.
> -static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
> - SelectionDAG &DAG) {
> - MVT VT = V1.getSimpleValueType();
> - assert(VT.is128BitVector() || VT.is256BitVector());
> +/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
> +///
> +/// This routine is only called when we have AVX2 and thus a reasonable
> +/// instruction set for v4i64 shuffling.
> +static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> + const X86Subtarget *Subtarget,
> + SelectionDAG &DAG) {
> + SDLoc DL(Op);
> + assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
> + assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
> + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> + ArrayRef<int> Mask = SVOp->getMask();
> + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
> + assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");
>
> - MVT EltVT = VT.getVectorElementType();
> - unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
> - unsigned NumElts = VT.getVectorNumElements();
> + SmallVector<int, 4> WidenedMask;
> + if (canWidenShuffleElements(Mask, WidenedMask))
> + return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget,
> + DAG);
>
> - SmallVector<SDValue, 32> PshufbMask;
> - for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
> - int InputIdx = MaskVals[OutputIdx];
> - unsigned InputByteIdx;
> + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
> + Subtarget, DAG))
> + return Blend;
>
> - if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
> - InputByteIdx = 0x80;
> - else {
> - // Cross lane is not allowed.
> - if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
> - return SDValue();
> - InputByteIdx = InputIdx * EltSizeInBytes;
> - // Index is an byte offset within the 128-bit lane.
> - InputByteIdx &= 0xf;
> - }
> + // Check for being able to broadcast a single element.
> + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1,
> + Mask, Subtarget, DAG))
> + return Broadcast;
>
> - for (unsigned j = 0; j < EltSizeInBytes; ++j) {
> - PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
> - if (InputByteIdx != 0x80)
> - ++InputByteIdx;
> + // When the shuffle is mirrored between the 128-bit lanes of the unit, we can
> + // use lower latency instructions that will operate on both 128-bit lanes.
> + SmallVector<int, 2> RepeatedMask;
> + if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
> + if (isSingleInputShuffleMask(Mask)) {
> + int PSHUFDMask[] = {-1, -1, -1, -1};
> + for (int i = 0; i < 2; ++i)
> + if (RepeatedMask[i] >= 0) {
> + PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
> + PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
> + }
> + return DAG.getNode(
> + ISD::BITCAST, DL, MVT::v4i64,
> + DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
> + DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
> + getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
> }
> }
>
> - MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
> - if (ShufVT != VT)
> - V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
> - return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
> - DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
> -}
> + // AVX2 provides a direct instruction for permuting a single input across
> + // lanes.
> + if (isSingleInputShuffleMask(Mask))
> + return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
> + getV4X86ShuffleImm8ForMask(Mask, DAG));
>
> -// v8i16 shuffles - Prefer shuffles in the following order:
> -// 1. [all] pshuflw, pshufhw, optional move
> -// 2. [ssse3] 1 x pshufb
> -// 3. [ssse3] 2 x pshufb + 1 x por
> -// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
> -static SDValue
> -LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> - SDValue V1 = SVOp->getOperand(0);
> - SDValue V2 = SVOp->getOperand(1);
> - SDLoc dl(SVOp);
> - SmallVector<int, 8> MaskVals;
> + // Try to use shift instructions.
> + if (SDValue Shift =
> + lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, DAG))
> + return Shift;
>
> - // Determine if more than 1 of the words in each of the low and high quadwords
> - // of the result come from the same quadword of one of the two inputs. Undef
> - // mask values count as coming from any quadword, for better codegen.
> - //
> - // Lo/HiQuad[i] = j indicates how many words from the ith quad of the input
> - // feeds this quad. For i, 0 and 1 refer to V1, 2 and 3 refer to V2.
> - unsigned LoQuad[] = { 0, 0, 0, 0 };
> - unsigned HiQuad[] = { 0, 0, 0, 0 };
> - // Indices of quads used.
> - std::bitset<4> InputQuads;
> - for (unsigned i = 0; i < 8; ++i) {
> - unsigned *Quad = i < 4 ? LoQuad : HiQuad;
> - int EltIdx = SVOp->getMaskElt(i);
> - MaskVals.push_back(EltIdx);
> - if (EltIdx < 0) {
> - ++Quad[0];
> - ++Quad[1];
> - ++Quad[2];
> - ++Quad[3];
> - continue;
> - }
> - ++Quad[EltIdx / 4];
> - InputQuads.set(EltIdx / 4);
> - }
> + // Use dedicated unpack instructions for masks that match their pattern.
> + if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
> + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
> + if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
> + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
> + if (isShuffleEquivalent(V1, V2, Mask, 4, 0, 6, 2))
> + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V2, V1);
> + if (isShuffleEquivalent(V1, V2, Mask, 5, 1, 7, 3))
> + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V2, V1);
>
> - int BestLoQuad = -1;
> - unsigned MaxQuad = 1;
> - for (unsigned i = 0; i < 4; ++i) {
> - if (LoQuad[i] > MaxQuad) {
> - BestLoQuad = i;
> - MaxQuad = LoQuad[i];
> - }
> - }
> + // Try to simplify this by merging 128-bit lanes to enable a lane-based
> +  // shuffle. However, if we have AVX2 and either input is already in place,
> +  // we will be able to shuffle the other input even across lanes in a single
> +  // instruction, so skip this pattern.
> + if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
> + isShuffleMaskInputInPlace(1, Mask))))
> + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
> + DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
> + return Result;
>
> - int BestHiQuad = -1;
> - MaxQuad = 1;
> - for (unsigned i = 0; i < 4; ++i) {
> - if (HiQuad[i] > MaxQuad) {
> - BestHiQuad = i;
> - MaxQuad = HiQuad[i];
> - }
> - }
> + // Otherwise fall back on generic blend lowering.
> + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
> + Mask, DAG);
> +}
>
> - // For SSSE3, If all 8 words of the result come from only 1 quadword of each
> - // of the two input vectors, shuffle them into one input vector so only a
> - // single pshufb instruction is necessary. If there are more than 2 input
> - // quads, disable the next transformation since it does not help SSSE3.
> - bool V1Used = InputQuads[0] || InputQuads[1];
> - bool V2Used = InputQuads[2] || InputQuads[3];
> - if (Subtarget->hasSSSE3()) {
> - if (InputQuads.count() == 2 && V1Used && V2Used) {
> - BestLoQuad = InputQuads[0] ? 0 : 1;
> - BestHiQuad = InputQuads[2] ? 2 : 3;
> - }
> - if (InputQuads.count() > 2) {
> - BestLoQuad = -1;
> - BestHiQuad = -1;
> - }
> - }
> +/// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
> +///
> +/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
> +/// isn't available.
> +static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> + const X86Subtarget *Subtarget,
> + SelectionDAG &DAG) {
> + SDLoc DL(Op);
> + assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
> + assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
> + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> + ArrayRef<int> Mask = SVOp->getMask();
> + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
>
> - // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
> - // the shuffle mask. If a quad is scored as -1, that means that it contains
> - // words from all 4 input quadwords.
> - SDValue NewV;
> - if (BestLoQuad >= 0 || BestHiQuad >= 0) {
> - int MaskV[] = {
> - BestLoQuad < 0 ? 0 : BestLoQuad,
> - BestHiQuad < 0 ? 1 : BestHiQuad
> - };
> - NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
> - DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
> - DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
> - NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
> -
> - // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
> - // source words for the shuffle, to aid later transformations.
> - bool AllWordsInNewV = true;
> - bool InOrder[2] = { true, true };
> - for (unsigned i = 0; i != 8; ++i) {
> - int idx = MaskVals[i];
> - if (idx != (int)i)
> - InOrder[i/4] = false;
> - if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
> - continue;
> - AllWordsInNewV = false;
> - break;
> - }
> + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
> + Subtarget, DAG))
> + return Blend;
>
> - bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
> - if (AllWordsInNewV) {
> - for (int i = 0; i != 8; ++i) {
> - int idx = MaskVals[i];
> - if (idx < 0)
> - continue;
> - idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
> - if ((idx != i) && idx < 4)
> - pshufhw = false;
> - if ((idx != i) && idx > 3)
> - pshuflw = false;
> - }
> - V1 = NewV;
> - V2Used = false;
> - BestLoQuad = 0;
> - BestHiQuad = 1;
> - }
> -
> - // If we've eliminated the use of V2, and the new mask is a pshuflw or
> - // pshufhw, that's as cheap as it gets. Return the new shuffle.
> - if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
> - unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
> - unsigned TargetMask = 0;
> - NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
> - DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
> - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
> - TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
> - getShufflePSHUFLWImmediate(SVOp);
> - V1 = NewV.getOperand(0);
> - return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
> - }
> - }
> + // Check for being able to broadcast a single element.
> + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
> + Mask, Subtarget, DAG))
> + return Broadcast;
>
> - // Promote splats to a larger type which usually leads to more efficient code.
> - // FIXME: Is this true if pshufb is available?
> - if (SVOp->isSplat())
> - return PromoteSplat(SVOp, DAG);
> -
> - // If we have SSSE3, and all words of the result are from 1 input vector,
> - // case 2 is generated, otherwise case 3 is generated. If no SSSE3
> - // is present, fall back to case 4.
> - if (Subtarget->hasSSSE3()) {
> - SmallVector<SDValue,16> pshufbMask;
> + // If the shuffle mask is repeated in each 128-bit lane, we have many more
> + // options to efficiently lower the shuffle.
> + SmallVector<int, 4> RepeatedMask;
> + if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
> + assert(RepeatedMask.size() == 4 &&
> + "Repeated masks must be half the mask width!");
>
> - // If we have elements from both input vectors, set the high bit of the
> - // shuffle mask element to zero out elements that come from V2 in the V1
> - // mask, and elements that come from V1 in the V2 mask, so that the two
> - // results can be OR'd together.
> - bool TwoInputs = V1Used && V2Used;
> - V1 = getPSHUFB(MaskVals, V1, dl, DAG);
> - if (!TwoInputs)
> - return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
> -
> - // Calculate the shuffle mask for the second input, shuffle it, and
> - // OR it with the first shuffled input.
> - CommuteVectorShuffleMask(MaskVals, 8);
> - V2 = getPSHUFB(MaskVals, V2, dl, DAG);
> - V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
> - return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
> - }
> -
> - // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
> - // and update MaskVals with new element order.
> - std::bitset<8> InOrder;
> - if (BestLoQuad >= 0) {
> - int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
> - for (int i = 0; i != 4; ++i) {
> - int idx = MaskVals[i];
> - if (idx < 0) {
> - InOrder.set(i);
> - } else if ((idx / 4) == BestLoQuad) {
> - MaskV[i] = idx & 3;
> - InOrder.set(i);
> - }
> - }
> - NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
> - &MaskV[0]);
> -
> - if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
> - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
> - NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
> - NewV.getOperand(0),
> - getShufflePSHUFLWImmediate(SVOp), DAG);
> - }
> - }
> -
> - // If BestHi >= 0, generate a pshufhw to put the high elements in order,
> - // and update MaskVals with the new element order.
> - if (BestHiQuad >= 0) {
> - int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
> - for (unsigned i = 4; i != 8; ++i) {
> - int idx = MaskVals[i];
> - if (idx < 0) {
> - InOrder.set(i);
> - } else if ((idx / 4) == BestHiQuad) {
> - MaskV[i] = (idx & 3) + 4;
> - InOrder.set(i);
> - }
> - }
> - NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
> - &MaskV[0]);
> -
> - if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
> - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
> - NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
> - NewV.getOperand(0),
> - getShufflePSHUFHWImmediate(SVOp), DAG);
> - }
> - }
> -
> - // In case BestHi & BestLo were both -1, which means each quadword has a word
> - // from each of the four input quadwords, calculate the InOrder bitvector now
> - // before falling through to the insert/extract cleanup.
> - if (BestLoQuad == -1 && BestHiQuad == -1) {
> - NewV = V1;
> - for (int i = 0; i != 8; ++i)
> - if (MaskVals[i] < 0 || MaskVals[i] == i)
> - InOrder.set(i);
> - }
> -
> - // The other elements are put in the right place using pextrw and pinsrw.
> - for (unsigned i = 0; i != 8; ++i) {
> - if (InOrder[i])
> - continue;
> - int EltIdx = MaskVals[i];
> - if (EltIdx < 0)
> - continue;
> - SDValue ExtOp = (EltIdx < 8) ?
> - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
> - DAG.getIntPtrConstant(EltIdx)) :
> - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
> - DAG.getIntPtrConstant(EltIdx - 8));
> - NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
> - DAG.getIntPtrConstant(i));
> - }
> - return NewV;
> -}
> -
> -/// \brief v16i16 shuffles
> -///
> -/// FIXME: We only support generation of a single pshufb currently. We can
> -/// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as
> -/// well (e.g 2 x pshufb + 1 x por).
> -static SDValue
> -LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) {
> - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> - SDValue V1 = SVOp->getOperand(0);
> - SDValue V2 = SVOp->getOperand(1);
> - SDLoc dl(SVOp);
> + // Use even/odd duplicate instructions for masks that match their pattern.
> + if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 2, 2, 4, 4, 6, 6))
> + return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
> + if (isShuffleEquivalent(V1, V2, Mask, 1, 1, 3, 3, 5, 5, 7, 7))
> + return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
>
> - if (V2.getOpcode() != ISD::UNDEF)
> - return SDValue();
> + if (isSingleInputShuffleMask(Mask))
> + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
> + getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
>
> - SmallVector<int, 16> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
> - return getPSHUFB(MaskVals, V1, dl, DAG);
> -}
> + // Use dedicated unpack instructions for masks that match their pattern.
> + if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 1, 9, 4, 12, 5, 13))
> + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
> + if (isShuffleEquivalent(V1, V2, Mask, 2, 10, 3, 11, 6, 14, 7, 15))
> + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
> + if (isShuffleEquivalent(V1, V2, Mask, 8, 0, 9, 1, 12, 4, 13, 5))
> + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V2, V1);
> + if (isShuffleEquivalent(V1, V2, Mask, 10, 2, 11, 3, 14, 6, 15, 7))
> + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V2, V1);
>
> -// v16i8 shuffles - Prefer shuffles in the following order:
> -// 1. [ssse3] 1 x pshufb
> -// 2. [ssse3] 2 x pshufb + 1 x por
> -// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw
> -static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
> - const X86Subtarget* Subtarget,
> - SelectionDAG &DAG) {
> - const TargetLowering &TLI = DAG.getTargetLoweringInfo();
> - SDValue V1 = SVOp->getOperand(0);
> - SDValue V2 = SVOp->getOperand(1);
> - SDLoc dl(SVOp);
> - ArrayRef<int> MaskVals = SVOp->getMask();
> + // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
> + // have already handled any direct blends. We also need to squash the
> + // repeated mask into a simulated v4f32 mask.
> + for (int i = 0; i < 4; ++i)
> + if (RepeatedMask[i] >= 8)
> + RepeatedMask[i] -= 4;
> + return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
> + }
>
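One note on the `-= 4` squash: as I read is128BitLaneRepeatedShuffleMask, V2's in-lane elements come back numbered 8..11 for v8f32, and the subtraction renumbers them into the 4..7 range a two-input v4f32 mask expects. Standalone sketch of just that renumbering (not the in-tree code):

  #include <array>
  #include <cassert>

  // Squash a per-lane repeated v8f32 mask (V1 lane elements 0-3, V2 lane
  // elements 8-11) into a simulated two-input v4f32 mask (V1 -> 0-3,
  // V2 -> 4-7).
  static std::array<int, 4> squashRepeatedMask(std::array<int, 4> Rep) {
    for (int &M : Rep)
      if (M >= 8)
        M -= 4;
    return Rep;
  }

  int main() {
    assert((squashRepeatedMask({0, 8, 1, 9}) == std::array<int, 4>{0, 4, 1, 5}));
  }
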
> - // Promote splats to a larger type which usually leads to more efficient code.
> - // FIXME: Is this true if pshufb is available?
> - if (SVOp->isSplat())
> - return PromoteSplat(SVOp, DAG);
> -
> - // If we have SSSE3, case 1 is generated when all result bytes come from
> - // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
> - // present, fall back to case 3.
> + // If we have a single input shuffle with different shuffle patterns in the
> + // two 128-bit lanes use the variable mask to VPERMILPS.
> + if (isSingleInputShuffleMask(Mask)) {
> + SDValue VPermMask[8];
> + for (int i = 0; i < 8; ++i)
> + VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
> + : DAG.getConstant(Mask[i], MVT::i32);
> + if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
> + return DAG.getNode(
> + X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
> + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
>
> - // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
> - if (Subtarget->hasSSSE3()) {
> - SmallVector<SDValue,16> pshufbMask;
> + if (Subtarget->hasAVX2())
> + return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
> + DAG.getNode(ISD::BITCAST, DL, MVT::v8f32,
> + DAG.getNode(ISD::BUILD_VECTOR, DL,
> + MVT::v8i32, VPermMask)),
> + V1);
>
> - // If all result elements are from one input vector, then only translate
> - // undef mask values to 0x80 (zero out result) in the pshufb mask.
> - //
> - // Otherwise, we have elements from both input vectors, and must zero out
> - // elements that come from V2 in the first mask, and V1 in the second mask
> - // so that we can OR them together.
> - for (unsigned i = 0; i != 16; ++i) {
> - int EltIdx = MaskVals[i];
> - if (EltIdx < 0 || EltIdx >= 16)
> - EltIdx = 0x80;
> - pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
> - }
> - V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
> - DAG.getNode(ISD::BUILD_VECTOR, dl,
> - MVT::v16i8, pshufbMask));
> -
> - // As PSHUFB will zero elements with negative indices, it's safe to ignore
> - // the 2nd operand if it's undefined or zero.
> - if (V2.getOpcode() == ISD::UNDEF ||
> - ISD::isBuildVectorAllZeros(V2.getNode()))
> - return V1;
> + // Otherwise, fall back.
> + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
> + DAG);
> + }
>
> - // Calculate the shuffle mask for the second input, shuffle it, and
> - // OR it with the first shuffled input.
> - pshufbMask.clear();
> - for (unsigned i = 0; i != 16; ++i) {
> - int EltIdx = MaskVals[i];
> - EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
> - pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
> - }
> - V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
> - DAG.getNode(ISD::BUILD_VECTOR, dl,
> - MVT::v16i8, pshufbMask));
> - return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
> - }
> -
> - // No SSSE3 - Calculate in place words and then fix all out of place words
> - // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from
> - // the 16 different words that comprise the two doublequadword input vectors.
> - V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
> - V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
> - SDValue NewV = V1;
> - for (int i = 0; i != 8; ++i) {
> - int Elt0 = MaskVals[i*2];
> - int Elt1 = MaskVals[i*2+1];
> + // Try to simplify this by merging 128-bit lanes to enable a lane-based
> + // shuffle.
> + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
> + DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
> + return Result;
>
> - // This word of the result is all undef, skip it.
> - if (Elt0 < 0 && Elt1 < 0)
> - continue;
> + // If we have AVX2 then we always want to lower with a blend because at v8 we
> + // can fully permute the elements.
> + if (Subtarget->hasAVX2())
> + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
> + Mask, DAG);
>
> - // This word of the result is already in the correct place, skip it.
> - if ((Elt0 == i*2) && (Elt1 == i*2+1))
> - continue;
> + // Otherwise fall back on generic lowering.
> + return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
> +}
>
> - SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
> - SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
> - SDValue InsElt;
> -
> - // If Elt0 and Elt1 are defined, are consecutive, and can be load
> - // using a single extract together, load it and store it.
> - if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
> - InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
> - DAG.getIntPtrConstant(Elt1 / 2));
> - NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
> - DAG.getIntPtrConstant(i));
> - continue;
> - }
> +/// \brief Handle lowering of 8-lane 32-bit integer shuffles.
> +///
> +/// This routine is only called when we have AVX2 and thus a reasonable
> +/// instruction set for v8i32 shuffling.
> +static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> + const X86Subtarget *Subtarget,
> + SelectionDAG &DAG) {
> + SDLoc DL(Op);
> + assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
> + assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
> + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> + ArrayRef<int> Mask = SVOp->getMask();
> + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
> + assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
>
> - // If Elt1 is defined, extract it from the appropriate source. If the
> - // source byte is not also odd, shift the extracted word left 8 bits
> - // otherwise clear the bottom 8 bits if we need to do an or.
> - if (Elt1 >= 0) {
> - InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
> - DAG.getIntPtrConstant(Elt1 / 2));
> - if ((Elt1 & 1) == 0)
> - InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
> - DAG.getConstant(8,
> - TLI.getShiftAmountTy(InsElt.getValueType())));
> - else if (Elt0 >= 0)
> - InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
> - DAG.getConstant(0xFF00, MVT::i16));
> - }
> - // If Elt0 is defined, extract it from the appropriate source. If the
> - // source byte is not also even, shift the extracted word right 8 bits. If
> - // Elt1 was also defined, OR the extracted values together before
> - // inserting them in the result.
> - if (Elt0 >= 0) {
> - SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
> - Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
> - if ((Elt0 & 1) != 0)
> - InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
> - DAG.getConstant(8,
> - TLI.getShiftAmountTy(InsElt0.getValueType())));
> - else if (Elt1 >= 0)
> - InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
> - DAG.getConstant(0x00FF, MVT::i16));
> - InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
> - : InsElt0;
> - }
> - NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
> - DAG.getIntPtrConstant(i));
> - }
> - return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
> -}
> + // Whenever we can lower this as a zext, that instruction is strictly faster
> + // than any alternative. It also allows us to fold memory operands into the
> + // shuffle in many cases.
> + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2,
> + Mask, Subtarget, DAG))
> + return ZExt;
>
> -// v32i8 shuffles - Translate to VPSHUFB if possible.
> -static
> -SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
> - const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> - MVT VT = SVOp->getSimpleValueType(0);
> - SDValue V1 = SVOp->getOperand(0);
> - SDValue V2 = SVOp->getOperand(1);
> - SDLoc dl(SVOp);
> - SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
> + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
> + Subtarget, DAG))
> + return Blend;
>
> - bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
> - bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
> - bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
> + // Check for being able to broadcast a single element.
> + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1,
> + Mask, Subtarget, DAG))
> + return Broadcast;
>
> - // VPSHUFB may be generated if
> - // (1) one of input vector is undefined or zeroinitializer.
> - // The mask value 0x80 puts 0 in the corresponding slot of the vector.
> - // And (2) the mask indexes don't cross the 128-bit lane.
> - if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
> - (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
> - return SDValue();
> + // If the shuffle mask is repeated in each 128-bit lane we can use more
> + // efficient instructions that mirror the shuffles across the two 128-bit
> + // lanes.
> + SmallVector<int, 4> RepeatedMask;
> + if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
> + assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
> + if (isSingleInputShuffleMask(Mask))
> + return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
> + getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
>
> - if (V1IsAllZero && !V2IsAllZero) {
> - CommuteVectorShuffleMask(MaskVals, 32);
> - V1 = V2;
> + // Use dedicated unpack instructions for masks that match their pattern.
> + if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 1, 9, 4, 12, 5, 13))
> + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
> + if (isShuffleEquivalent(V1, V2, Mask, 2, 10, 3, 11, 6, 14, 7, 15))
> + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
> + if (isShuffleEquivalent(V1, V2, Mask, 8, 0, 9, 1, 12, 4, 13, 5))
> + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V2, V1);
> + if (isShuffleEquivalent(V1, V2, Mask, 10, 2, 11, 3, 14, 6, 15, 7))
> + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V2, V1);
> }
> - return getPSHUFB(MaskVals, V1, dl, DAG);
> -}
>
> -/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
> -/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
> -/// done when every pair / quad of shuffle mask elements point to elements in
> -/// the right sequence. e.g.
> -/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
> -static
> -SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
> - SelectionDAG &DAG) {
> - MVT VT = SVOp->getSimpleValueType(0);
> - SDLoc dl(SVOp);
> - unsigned NumElems = VT.getVectorNumElements();
> - MVT NewVT;
> - unsigned Scale;
> - switch (VT.SimpleTy) {
> - default: llvm_unreachable("Unexpected!");
> - case MVT::v2i64:
> - case MVT::v2f64:
> - return SDValue(SVOp, 0);
> - case MVT::v4f32: NewVT = MVT::v2f64; Scale = 2; break;
> - case MVT::v4i32: NewVT = MVT::v2i64; Scale = 2; break;
> - case MVT::v8i16: NewVT = MVT::v4i32; Scale = 2; break;
> - case MVT::v16i8: NewVT = MVT::v4i32; Scale = 4; break;
> - case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
> - case MVT::v32i8: NewVT = MVT::v8i32; Scale = 4; break;
> - }
> + // Try to use shift instructions.
> + if (SDValue Shift =
> + lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, DAG))
> + return Shift;
>
> - SmallVector<int, 8> MaskVec;
> - for (unsigned i = 0; i != NumElems; i += Scale) {
> - int StartIdx = -1;
> - for (unsigned j = 0; j != Scale; ++j) {
> - int EltIdx = SVOp->getMaskElt(i+j);
> - if (EltIdx < 0)
> - continue;
> - if (StartIdx < 0)
> - StartIdx = (EltIdx / Scale);
> - if (EltIdx != (int)(StartIdx*Scale + j))
> - return SDValue();
> - }
> - MaskVec.push_back(StartIdx);
> - }
> + if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
> + DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
> + return Rotate;
>
> - SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
> - SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
> - return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
> -}
> -
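For anyone spelunking in the removed code: the doc comment's example works out to the v4i32 mask <1, 5, 0, 7> — each group of Scale adjacent elements must be consecutive and start at a multiple of Scale, and the group collapses to that start index. A small reconstruction of the check (my sketch, not the deleted function):

  #include <cassert>
  #include <vector>

  // Collapse a mask by Scale if each group of Scale elements is consecutive
  // and starts at a multiple of Scale; returns an empty vector otherwise.
  static std::vector<int> narrowMask(const std::vector<int> &Mask, int Scale) {
    std::vector<int> Out;
    for (size_t i = 0; i < Mask.size(); i += Scale) {
      int StartIdx = -1;
      for (int j = 0; j < Scale; ++j) {
        int Elt = Mask[i + j];
        if (Elt < 0)
          continue;
        if (StartIdx < 0)
          StartIdx = Elt / Scale;
        if (Elt != StartIdx * Scale + j)
          return {};
      }
      Out.push_back(StartIdx);
    }
    return Out;
  }

  int main() {
    // The doc comment's example: <2,3, 10,11, 0,1, 14,15> -> <1, 5, 0, 7>.
    assert((narrowMask({2, 3, 10, 11, 0, 1, 14, 15}, 2) ==
            std::vector<int>{1, 5, 0, 7}));
  }
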
> -/// getVZextMovL - Return a zero-extending vector move low node.
> -///
> -static SDValue getVZextMovL(MVT VT, MVT OpVT,
> - SDValue SrcOp, SelectionDAG &DAG,
> - const X86Subtarget *Subtarget, SDLoc dl) {
> - if (VT == MVT::v2f64 || VT == MVT::v4f32) {
> - LoadSDNode *LD = nullptr;
> - if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
> - LD = dyn_cast<LoadSDNode>(SrcOp);
> - if (!LD) {
> - // movssrr and movsdrr do not clear top bits. Try to use movd, movq
> - // instead.
> - MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
> - if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
> - SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
> - SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
> - SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
> - // PR2108
> - OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
> - return DAG.getNode(ISD::BITCAST, dl, VT,
> - DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
> - DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
> - OpVT,
> - SrcOp.getOperand(0)
> - .getOperand(0))));
> - }
> - }
> + // If the shuffle patterns aren't repeated but it is a single input, directly
> + // generate a cross-lane VPERMD instruction.
> + if (isSingleInputShuffleMask(Mask)) {
> + SDValue VPermMask[8];
> + for (int i = 0; i < 8; ++i)
> + VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
> + : DAG.getConstant(Mask[i], MVT::i32);
> + return DAG.getNode(
> + X86ISD::VPERMV, DL, MVT::v8i32,
> + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
> }
>
> - return DAG.getNode(ISD::BITCAST, dl, VT,
> - DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
> - DAG.getNode(ISD::BITCAST, dl,
> - OpVT, SrcOp)));
> -}
> + // Try to simplify this by merging 128-bit lanes to enable a lane-based
> + // shuffle.
> + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
> + DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
> + return Result;
>
> -/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles
> -/// which could not be matched by any known target specific shuffle
> -static SDValue
> -LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
> + // Otherwise fall back on generic blend lowering.
> + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
> + Mask, DAG);
> +}
>
> - SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
> - if (NewOp.getNode())
> - return NewOp;
> +/// \brief Handle lowering of 16-lane 16-bit integer shuffles.
> +///
> +/// This routine is only called when we have AVX2 and thus a reasonable
> +/// instruction set for v16i16 shuffling.
> +static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> + const X86Subtarget *Subtarget,
> + SelectionDAG &DAG) {
> + SDLoc DL(Op);
> + assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
> + assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
> + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> + ArrayRef<int> Mask = SVOp->getMask();
> + assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
> + assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
>
> - MVT VT = SVOp->getSimpleValueType(0);
> + // Whenever we can lower this as a zext, that instruction is strictly faster
> + // than any alternative. It also allows us to fold memory operands into the
> + // shuffle in many cases.
> + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2,
> + Mask, Subtarget, DAG))
> + return ZExt;
>
> - unsigned NumElems = VT.getVectorNumElements();
> - unsigned NumLaneElems = NumElems / 2;
> + // Check for being able to broadcast a single element.
> + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
> + Mask, Subtarget, DAG))
> + return Broadcast;
>
> - SDLoc dl(SVOp);
> - MVT EltVT = VT.getVectorElementType();
> - MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
> - SDValue Output[2];
> + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
> + Subtarget, DAG))
> + return Blend;
>
> - SmallVector<int, 16> Mask;
> - for (unsigned l = 0; l < 2; ++l) {
> - // Build a shuffle mask for the output, discovering on the fly which
> - // input vectors to use as shuffle operands (recorded in InputUsed).
> - // If building a suitable shuffle vector proves too hard, then bail
> - // out with UseBuildVector set.
> - bool UseBuildVector = false;
> - int InputUsed[2] = { -1, -1 }; // Not yet discovered.
> - unsigned LaneStart = l * NumLaneElems;
> - for (unsigned i = 0; i != NumLaneElems; ++i) {
> - // The mask element. This indexes into the input.
> - int Idx = SVOp->getMaskElt(i+LaneStart);
> - if (Idx < 0) {
> - // the mask element does not index into any input vector.
> - Mask.push_back(-1);
> - continue;
> - }
> + // Use dedicated unpack instructions for masks that match their pattern.
> + if (isShuffleEquivalent(V1, V2, Mask,
> + // First 128-bit lane:
> + 0, 16, 1, 17, 2, 18, 3, 19,
> + // Second 128-bit lane:
> + 8, 24, 9, 25, 10, 26, 11, 27))
> + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
> + if (isShuffleEquivalent(V1, V2, Mask,
> + // First 128-bit lane:
> + 4, 20, 5, 21, 6, 22, 7, 23,
> + // Second 128-bit lane:
> + 12, 28, 13, 29, 14, 30, 15, 31))
> + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
>
> - // The input vector this mask element indexes into.
> - int Input = Idx / NumLaneElems;
> + // Try to use shift instructions.
> + if (SDValue Shift =
> + lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, DAG))
> + return Shift;
>
> - // Turn the index into an offset from the start of the input vector.
> - Idx -= Input * NumLaneElems;
> + // Try to use byte rotation instructions.
> + if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
> + DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
> + return Rotate;
>
> - // Find or create a shuffle vector operand to hold this input.
> - unsigned OpNo;
> - for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
> - if (InputUsed[OpNo] == Input)
> - // This input vector is already an operand.
> - break;
> - if (InputUsed[OpNo] < 0) {
> - // Create a new operand for this input vector.
> - InputUsed[OpNo] = Input;
> - break;
> - }
> - }
> + if (isSingleInputShuffleMask(Mask)) {
> + // There are no generalized cross-lane shuffle operations available on i16
> + // element types.
> + if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
> + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
> + Mask, DAG);
>
> - if (OpNo >= array_lengthof(InputUsed)) {
> - // More than two input vectors used! Give up on trying to create a
> - // shuffle vector. Insert all elements into a BUILD_VECTOR instead.
> - UseBuildVector = true;
> - break;
> + SDValue PSHUFBMask[32];
> + for (int i = 0; i < 16; ++i) {
> + if (Mask[i] == -1) {
> + PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
> + continue;
> }
>
> - // Add the mask index for the new shuffle vector.
> - Mask.push_back(Idx + OpNo * NumLaneElems);
> - }
> -
> - if (UseBuildVector) {
> - SmallVector<SDValue, 16> SVOps;
> - for (unsigned i = 0; i != NumLaneElems; ++i) {
> - // The mask element. This indexes into the input.
> - int Idx = SVOp->getMaskElt(i+LaneStart);
> - if (Idx < 0) {
> - SVOps.push_back(DAG.getUNDEF(EltVT));
> - continue;
> - }
> -
> - // The input vector this mask element indexes into.
> - int Input = Idx / NumElems;
> -
> - // Turn the index into an offset from the start of the input vector.
> - Idx -= Input * NumElems;
> -
> - // Extract the vector element by hand.
> - SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
> - SVOp->getOperand(Input),
> - DAG.getIntPtrConstant(Idx)));
> - }
> -
> - // Construct the output using a BUILD_VECTOR.
> - Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
> - } else if (InputUsed[0] < 0) {
> - // No input vectors were used! The result is undefined.
> - Output[l] = DAG.getUNDEF(NVT);
> - } else {
> - SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
> - (InputUsed[0] % 2) * NumLaneElems,
> - DAG, dl);
> - // If only one input was used, use an undefined vector for the other.
> - SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
> - Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
> - (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
> - // At least one input vector was used. Create a new shuffle vector.
> - Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
> + int M = i < 8 ? Mask[i] : Mask[i] - 8;
> + assert(M >= 0 && M < 8 && "Invalid single-input mask!");
> + PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8);
> + PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8);
> }
> -
> - Mask.clear();
> + return DAG.getNode(
> + ISD::BITCAST, DL, MVT::v16i16,
> + DAG.getNode(
> + X86ISD::PSHUFB, DL, MVT::v32i8,
> + DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1),
> + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
> }
>
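The PSHUFB mask construction above doubles each in-lane word index into a byte pair; the `i < 8 ? Mask[i] : Mask[i] - 8` step strips the lane base, since PSHUFB indexes within each 128-bit lane independently. Standalone sketch of the mapping (illustration only, not the in-tree code):

  #include <cassert>
  #include <vector>

  // Expand a single-input v16i16 mask into a 32-entry PSHUFB byte mask,
  // assuming no element crosses its 128-bit lane (-1 stays undef here).
  static std::vector<int> wordMaskToPSHUFB(const std::vector<int> &Mask) {
    std::vector<int> Bytes(32, -1);
    for (int i = 0; i < 16; ++i) {
      if (Mask[i] < 0)
        continue;
      int M = i < 8 ? Mask[i] : Mask[i] - 8; // in-lane word index, 0..7
      Bytes[2 * i] = 2 * M;                  // low byte of the word
      Bytes[2 * i + 1] = 2 * M + 1;          // high byte of the word
    }
    return Bytes;
  }

  int main() {
    std::vector<int> Mask(16, -1);
    Mask[0] = 3;  // word 3 of the low lane  -> bytes 6,7
    Mask[8] = 12; // word 4 of the high lane -> bytes 8,9 within that lane
    std::vector<int> B = wordMaskToPSHUFB(Mask);
    assert(B[0] == 6 && B[1] == 7);
    assert(B[16] == 8 && B[17] == 9);
  }
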
> - // Concatenate the result back
> - return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
> + // Try to simplify this by merging 128-bit lanes to enable a lane-based
> + // shuffle.
> + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
> + DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
> + return Result;
> +
> + // Otherwise fall back on generic lowering.
> + return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
> }
>
> -/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
> -/// 4 elements, and match them with several different shuffle types.
> -static SDValue
> -LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
> - SDValue V1 = SVOp->getOperand(0);
> - SDValue V2 = SVOp->getOperand(1);
> - SDLoc dl(SVOp);
> - MVT VT = SVOp->getSimpleValueType(0);
> +/// \brief Handle lowering of 32-lane 8-bit integer shuffles.
> +///
> +/// This routine is only called when we have AVX2 and thus a reasonable
> +/// instruction set for v32i8 shuffling.
> +static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> + const X86Subtarget *Subtarget,
> + SelectionDAG &DAG) {
> + SDLoc DL(Op);
> + assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
> + assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
> + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> + ArrayRef<int> Mask = SVOp->getMask();
> + assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
> + assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
>
> - assert(VT.is128BitVector() && "Unsupported vector size");
> + // Whenever we can lower this as a zext, that instruction is strictly faster
> + // than any alternative. It also allows us to fold memory operands into the
> + // shuffle in many cases.
> + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2,
> + Mask, Subtarget, DAG))
> + return ZExt;
>
> - std::pair<int, int> Locs[4];
> - int Mask1[] = { -1, -1, -1, -1 };
> - SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
> -
> - unsigned NumHi = 0;
> - unsigned NumLo = 0;
> - for (unsigned i = 0; i != 4; ++i) {
> - int Idx = PermMask[i];
> - if (Idx < 0) {
> - Locs[i] = std::make_pair(-1, -1);
> - } else {
> - assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
> - if (Idx < 4) {
> - Locs[i] = std::make_pair(0, NumLo);
> - Mask1[NumLo] = Idx;
> - NumLo++;
> - } else {
> - Locs[i] = std::make_pair(1, NumHi);
> - if (2+NumHi < 4)
> - Mask1[2+NumHi] = Idx;
> - NumHi++;
> - }
> - }
> - }
> + // Check for being able to broadcast a single element.
> + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
> + Mask, Subtarget, DAG))
> + return Broadcast;
>
> - if (NumLo <= 2 && NumHi <= 2) {
> -    // No more than two elements come from either vector. This can be
> -    // implemented with two shuffles. The first shuffle gathers the elements.
> -    // The second shuffle, which takes the first shuffle as both of its
> -    // vector operands, puts the elements into the right order.
> - V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
> + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
> + Subtarget, DAG))
> + return Blend;
>
> - int Mask2[] = { -1, -1, -1, -1 };
> + // Use dedicated unpack instructions for masks that match their pattern.
> + // Note that these are repeated 128-bit lane unpacks, not unpacks across all
> + // 256-bit lanes.
> + if (isShuffleEquivalent(
> + V1, V2, Mask,
> + // First 128-bit lane:
> + 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
> + // Second 128-bit lane:
> + 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55))
> + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
> + if (isShuffleEquivalent(
> + V1, V2, Mask,
> + // First 128-bit lane:
> + 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
> + // Second 128-bit lane:
> + 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63))
> + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
>
> - for (unsigned i = 0; i != 4; ++i)
> - if (Locs[i].first != -1) {
> - unsigned Idx = (i < 2) ? 0 : 4;
> - Idx += Locs[i].first * 2 + Locs[i].second;
> - Mask2[i] = Idx;
> - }
> + // Try to use shift instructions.
> + if (SDValue Shift =
> + lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, DAG))
> + return Shift;
>
> - return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
> - }
> + // Try to use byte rotation instructions.
> + if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
> + DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
> + return Rotate;
>
> - if (NumLo == 3 || NumHi == 3) {
> - // Otherwise, we must have three elements from one vector, call it X, and
> - // one element from the other, call it Y. First, use a shufps to build an
> - // intermediate vector with the one element from Y and the element from X
> - // that will be in the same half in the final destination (the indexes don't
> - // matter). Then, use a shufps to build the final vector, taking the half
> - // containing the element from Y from the intermediate, and the other half
> - // from X.
> - if (NumHi == 3) {
> - // Normalize it so the 3 elements come from V1.
> - CommuteVectorShuffleMask(PermMask, 4);
> - std::swap(V1, V2);
> - }
> + if (isSingleInputShuffleMask(Mask)) {
> + // There are no generalized cross-lane shuffle operations available on i8
> + // element types.
> + if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
> + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
> + Mask, DAG);
>
> - // Find the element from V2.
> - unsigned HiIndex;
> - for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
> - int Val = PermMask[HiIndex];
> - if (Val < 0)
> - continue;
> - if (Val >= 4)
> - break;
> - }
> + SDValue PSHUFBMask[32];
> + for (int i = 0; i < 32; ++i)
> + PSHUFBMask[i] =
> + Mask[i] < 0
> + ? DAG.getUNDEF(MVT::i8)
> + : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, MVT::i8);
>
> - Mask1[0] = PermMask[HiIndex];
> - Mask1[1] = -1;
> - Mask1[2] = PermMask[HiIndex^1];
> - Mask1[3] = -1;
> - V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
> -
> - if (HiIndex >= 2) {
> - Mask1[0] = PermMask[0];
> - Mask1[1] = PermMask[1];
> - Mask1[2] = HiIndex & 1 ? 6 : 4;
> - Mask1[3] = HiIndex & 1 ? 4 : 6;
> - return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
> - }
> -
> - Mask1[0] = HiIndex & 1 ? 2 : 0;
> - Mask1[1] = HiIndex & 1 ? 0 : 2;
> - Mask1[2] = PermMask[2];
> - Mask1[3] = PermMask[3];
> - if (Mask1[2] >= 0)
> - Mask1[2] += 4;
> - if (Mask1[3] >= 0)
> - Mask1[3] += 4;
> - return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
> - }
> -
> - // Break it into (shuffle shuffle_hi, shuffle_lo).
> - int LoMask[] = { -1, -1, -1, -1 };
> - int HiMask[] = { -1, -1, -1, -1 };
> -
> - int *MaskPtr = LoMask;
> - unsigned MaskIdx = 0;
> - unsigned LoIdx = 0;
> - unsigned HiIdx = 2;
> - for (unsigned i = 0; i != 4; ++i) {
> - if (i == 2) {
> - MaskPtr = HiMask;
> - MaskIdx = 1;
> - LoIdx = 0;
> - HiIdx = 2;
> - }
> - int Idx = PermMask[i];
> - if (Idx < 0) {
> - Locs[i] = std::make_pair(-1, -1);
> - } else if (Idx < 4) {
> - Locs[i] = std::make_pair(MaskIdx, LoIdx);
> - MaskPtr[LoIdx] = Idx;
> - LoIdx++;
> - } else {
> - Locs[i] = std::make_pair(MaskIdx, HiIdx);
> - MaskPtr[HiIdx] = Idx;
> - HiIdx++;
> - }
> + return DAG.getNode(
> + X86ISD::PSHUFB, DL, MVT::v32i8, V1,
> + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
> }
>
> - SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
> - SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
> - int MaskOps[] = { -1, -1, -1, -1 };
> - for (unsigned i = 0; i != 4; ++i)
> - if (Locs[i].first != -1)
> - MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
> - return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
> -}
> -
> -static bool MayFoldVectorLoad(SDValue V) {
> - while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
> - V = V.getOperand(0);
> -
> - if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
> - V = V.getOperand(0);
> - if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
> - V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
> - // BUILD_VECTOR (load), undef
> - V = V.getOperand(0);
> + // Try to simplify this by merging 128-bit lanes to enable a lane-based
> + // shuffle.
> + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
> + DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
> + return Result;
>
> - return MayFoldLoad(V);
> + // Otherwise fall back on generic lowering.
> + return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
> }
>
> -static
> -SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
> - MVT VT = Op.getSimpleValueType();
> -
> - // Canonicalize to v2f64.
> - V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
> - return DAG.getNode(ISD::BITCAST, dl, VT,
> - getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
> - V1, DAG));
> -}
> +/// \brief High-level routine to lower various 256-bit x86 vector shuffles.
> +///
> +/// This routine either breaks down the specific type of a 256-bit x86 vector
> +/// shuffle or splits it into two 128-bit shuffles and fuses the results back
> +/// together based on the available instructions.
> +static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> + MVT VT, const X86Subtarget *Subtarget,
> + SelectionDAG &DAG) {
> + SDLoc DL(Op);
> + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> + ArrayRef<int> Mask = SVOp->getMask();
>
> -static
> -SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
> - bool HasSSE2) {
> - SDValue V1 = Op.getOperand(0);
> - SDValue V2 = Op.getOperand(1);
> - MVT VT = Op.getSimpleValueType();
> + // There is a really nice hard cut-over between AVX1 and AVX2 that means we can
> + // check for those subtargets here and avoid much of the subtarget querying in
> + // the per-vector-type lowering routines. With AVX1 we have essentially *zero*
> + // ability to manipulate a 256-bit vector with integer types. Since we'll use
> + // floating point types there eventually, just immediately cast everything to
> + // a float and operate entirely in that domain.
> + if (VT.isInteger() && !Subtarget->hasAVX2()) {
> + int ElementBits = VT.getScalarSizeInBits();
> + if (ElementBits < 32)
> + // No floating point type available, decompose into 128-bit vectors.
> + return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
>
> - assert(VT != MVT::v2i64 && "unsupported shuffle type");
> + MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
> + VT.getVectorNumElements());
> + V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1);
> + V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2);
> + return DAG.getNode(ISD::BITCAST, DL, VT,
> + DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
> + }
>
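So the AVX1 story for 256-bit integer shuffles comes down to a two-way decision: sub-32-bit element types get split into two 128-bit shuffles, and 32/64-bit ones are recast into the matching FP type. A toy decision table just to make that explicit (not the in-tree code):

  #include <cassert>
  #include <string>

  // For a 256-bit integer shuffle without AVX2: split narrow element types,
  // otherwise shuffle in the equivalent floating-point type.
  static std::string avx1IntegerStrategy(int ElementBits) {
    if (ElementBits < 32)
      return "split into two 128-bit shuffles";
    return ElementBits == 32 ? "bitcast to v8f32" : "bitcast to v4f64";
  }

  int main() {
    assert(avx1IntegerStrategy(8) == "split into two 128-bit shuffles");
    assert(avx1IntegerStrategy(16) == "split into two 128-bit shuffles");
    assert(avx1IntegerStrategy(32) == "bitcast to v8f32");
    assert(avx1IntegerStrategy(64) == "bitcast to v4f64");
  }
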
> - if (HasSSE2 && VT == MVT::v2f64)
> - return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
> + switch (VT.SimpleTy) {
> + case MVT::v4f64:
> + return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
> + case MVT::v4i64:
> + return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
> + case MVT::v8f32:
> + return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
> + case MVT::v8i32:
> + return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
> + case MVT::v16i16:
> + return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
> + case MVT::v32i8:
> + return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
>
> - // v4f32 or v4i32: canonicalize to v4f32 (which is legal for SSE1)
> - return DAG.getNode(ISD::BITCAST, dl, VT,
> - getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
> - DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
> - DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
> + default:
> + llvm_unreachable("Not a valid 256-bit x86 vector type!");
> + }
> }
>
> -static
> -SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
> - SDValue V1 = Op.getOperand(0);
> - SDValue V2 = Op.getOperand(1);
> - MVT VT = Op.getSimpleValueType();
> -
> - assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
> - "unsupported shuffle type");
> +/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
> +static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> + const X86Subtarget *Subtarget,
> + SelectionDAG &DAG) {
> + SDLoc DL(Op);
> + assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
> + assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
> + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> + ArrayRef<int> Mask = SVOp->getMask();
> + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
>
> - if (V2.getOpcode() == ISD::UNDEF)
> - V2 = V1;
> + // X86 has dedicated unpack instructions that can handle specific blend
> + // operations: UNPCKH and UNPCKL.
> + if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 2, 10, 4, 12, 6, 14))
> + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
> + if (isShuffleEquivalent(V1, V2, Mask, 1, 9, 3, 11, 5, 13, 7, 15))
> + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
>
> - // v4i32 or v4f32
> - return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
> + // FIXME: Implement direct support for this type!
> + return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
> }
>
> -static
> -SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
> - SDValue V1 = Op.getOperand(0);
> - SDValue V2 = Op.getOperand(1);
> - MVT VT = Op.getSimpleValueType();
> - unsigned NumElems = VT.getVectorNumElements();
> -
> - // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
> - // operand of these instructions is only memory, so check if there's a
> -  // potential load folding here, otherwise use SHUFPS or MOVSD to match the
> - // same masks.
> - bool CanFoldLoad = false;
> -
> - // Trivial case, when V2 comes from a load.
> - if (MayFoldVectorLoad(V2))
> - CanFoldLoad = true;
> -
> - // When V1 is a load, it can be folded later into a store in isel, example:
> - // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
> - // turns into:
> - // (MOVLPSmr addr:$src1, VR128:$src2)
> - // So, recognize this potential and also use MOVLPS or MOVLPD
> - else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
> - CanFoldLoad = true;
> -
> +/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
> +static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> + const X86Subtarget *Subtarget,
> + SelectionDAG &DAG) {
> + SDLoc DL(Op);
> + assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
> + assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
> ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> - if (CanFoldLoad) {
> - if (HasSSE2 && NumElems == 2)
> - return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
> -
> - if (NumElems == 4)
> - // If we don't care about the second element, proceed to use movss.
> - if (SVOp->getMaskElt(1) != -1)
> - return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
> - }
> -
> - // movl and movlp will both match v2i64, but v2i64 is never matched by
> - // movl earlier because we make it strict to avoid messing with the movlp load
> - // folding logic (see the code above getMOVLP call). Match it here then,
> - // this is horrible, but will stay like this until we move all shuffle
> - // matching to x86 specific nodes. Note that for the 1st condition all
> - // types are matched with movsd.
> - if (HasSSE2) {
> - // FIXME: isMOVLMask should be checked and matched before getMOVLP,
> - // as to remove this logic from here, as much as possible
> - if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
> - return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
> - return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
> - }
> -
> - assert(VT != MVT::v4i32 && "unsupported shuffle type");
> -
> - // Invert the operand order and use SHUFPS to match it.
> - return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
> - getShuffleSHUFImmediate(SVOp), DAG);
> -}
> + ArrayRef<int> Mask = SVOp->getMask();
> + assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
>
> -static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
> - SelectionDAG &DAG) {
> - SDLoc dl(Load);
> - MVT VT = Load->getSimpleValueType(0);
> - MVT EVT = VT.getVectorElementType();
> - SDValue Addr = Load->getOperand(1);
> - SDValue NewAddr = DAG.getNode(
> - ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
> - DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
> + // Use dedicated unpack instructions for masks that match their pattern.
> + if (isShuffleEquivalent(V1, V2, Mask,
> + 0, 16, 1, 17, 4, 20, 5, 21,
> + 8, 24, 9, 25, 12, 28, 13, 29))
> + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
> + if (isShuffleEquivalent(V1, V2, Mask,
> + 2, 18, 3, 19, 6, 22, 7, 23,
> + 10, 26, 11, 27, 14, 30, 15, 31))
> + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
>
> - SDValue NewLoad =
> - DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
> - DAG.getMachineFunction().getMachineMemOperand(
> - Load->getMemOperand(), 0, EVT.getStoreSize()));
> - return NewLoad;
> + // FIXME: Implement direct support for this type!
> + return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
> }
>
> -// It is only safe to call this function if isINSERTPSMask is true for
> -// this shufflevector mask.
> -static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
> - SelectionDAG &DAG) {
> - // Generate an insertps instruction when inserting an f32 from memory onto a
> - // v4f32 or when copying a member from one v4f32 to another.
> - // We also use it for transferring i32 from one register to another,
> - // since it simply copies the same bits.
> - // If we're transferring an i32 from memory to a specific element in a
> - // register, we output a generic DAG that will match the PINSRD
> - // instruction.
> - MVT VT = SVOp->getSimpleValueType(0);
> - MVT EVT = VT.getVectorElementType();
> - SDValue V1 = SVOp->getOperand(0);
> - SDValue V2 = SVOp->getOperand(1);
> - auto Mask = SVOp->getMask();
> - assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
> - "unsupported vector type for insertps/pinsrd");
> -
> - auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; };
> - auto FromV2Predicate = [](const int &i) { return i >= 4; };
> - int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate);
> -
> - SDValue From;
> - SDValue To;
> - unsigned DestIndex;
> - if (FromV1 == 1) {
> - From = V1;
> - To = V2;
> - DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) -
> - Mask.begin();
> -
> - // If we have 1 element from each vector, we have to check if we're
> - // changing V1's element's place. If so, we're done. Otherwise, we
> - // should assume we're changing V2's element's place and behave
> - // accordingly.
> - int FromV2 = std::count_if(Mask.begin(), Mask.end(), FromV2Predicate);
> - assert(DestIndex <= INT32_MAX && "truncated destination index");
> - if (FromV1 == FromV2 &&
> - static_cast<int>(DestIndex) == Mask[DestIndex] % 4) {
> - From = V2;
> - To = V1;
> - DestIndex =
> - std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
> - }
> - } else {
> - assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 &&
> - "More than one element from V1 and from V2, or no elements from one "
> - "of the vectors. This case should not have returned true from "
> - "isINSERTPSMask");
> - From = V2;
> - To = V1;
> - DestIndex =
> - std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
> - }
> -
> - // Get an index into the source vector in the range [0,4) (the mask is
> - // in the range [0,8) because it can address V1 and V2)
> - unsigned SrcIndex = Mask[DestIndex] % 4;
> - if (MayFoldLoad(From)) {
> - // Trivial case, when From comes from a load and is only used by the
> - // shuffle. Make it use insertps from the vector that we need from that
> - // load.
> - SDValue NewLoad =
> - NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG);
> - if (!NewLoad.getNode())
> - return SDValue();
> +/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
> +static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> + const X86Subtarget *Subtarget,
> + SelectionDAG &DAG) {
> + SDLoc DL(Op);
> + assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
> + assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
> + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> + ArrayRef<int> Mask = SVOp->getMask();
> + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
>
> - if (EVT == MVT::f32) {
> - // Create this as a scalar to vector to match the instruction pattern.
> - SDValue LoadScalarToVector =
> - DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
> - SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
> - return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
> - InsertpsMask);
> - } else { // EVT == MVT::i32
> - // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
> - // instruction, to match the PINSRD instruction, which loads an i32 to a
> - // certain vector element.
> - return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
> - DAG.getConstant(DestIndex, MVT::i32));
> - }
> - }
> + // X86 has dedicated unpack instructions that can handle specific blend
> + // operations: UNPCKH and UNPCKL.
> + if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 2, 10, 4, 12, 6, 14))
> + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
> + if (isShuffleEquivalent(V1, V2, Mask, 1, 9, 3, 11, 5, 13, 7, 15))
> + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
>
> - // Vector-element-to-vector
> - SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
> - return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
> + // FIXME: Implement direct support for this type!
> + return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
> }
>
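On the removed getINSERTPS: the immediate it built follows the usual SSE4.1 INSERTPS layout — source element in bits 7:6, destination slot in bits 5:4, zero mask in bits 3:0 — which is what `DestIndex << 4 | SrcIndex << 6` encodes. A quick sketch of that encoding (hypothetical helper name):

  #include <cassert>
  #include <cstdint>

  // INSERTPS immediate layout: bits [7:6] = source element (CountS),
  // bits [5:4] = destination slot (CountD), bits [3:0] = zero mask.
  static uint8_t insertpsImm(unsigned SrcIndex, unsigned DestIndex,
                             unsigned ZMask = 0) {
    return static_cast<uint8_t>((SrcIndex << 6) | (DestIndex << 4) | ZMask);
  }

  int main() {
    // Copy element 2 of the source into slot 1 of the destination, no zeroing.
    assert(insertpsImm(2, 1) == 0x90);
  }
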
> -// Reduce a vector shuffle to zext.
> -static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> - // PMOVZX is only available from SSE41.
> - if (!Subtarget->hasSSE41())
> - return SDValue();
> -
> - MVT VT = Op.getSimpleValueType();
> -
> - // Only AVX2 support 256-bit vector integer extending.
> - if (!Subtarget->hasInt256() && VT.is256BitVector())
> - return SDValue();
> -
> - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> +/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
> +static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> + const X86Subtarget *Subtarget,
> + SelectionDAG &DAG) {
> SDLoc DL(Op);
> - SDValue V1 = Op.getOperand(0);
> - SDValue V2 = Op.getOperand(1);
> - unsigned NumElems = VT.getVectorNumElements();
> + assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
> + assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
> + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> + ArrayRef<int> Mask = SVOp->getMask();
> + assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
>
> - // Extending is an unary operation and the element type of the source vector
> - // won't be equal to or larger than i64.
> - if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
> - VT.getVectorElementType() == MVT::i64)
> - return SDValue();
> + // Use dedicated unpack instructions for masks that match their pattern.
> + if (isShuffleEquivalent(V1, V2, Mask,
> + 0, 16, 1, 17, 4, 20, 5, 21,
> + 8, 24, 9, 25, 12, 28, 13, 29))
> + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
> + if (isShuffleEquivalent(V1, V2, Mask,
> + 2, 18, 3, 19, 6, 22, 7, 23,
> + 10, 26, 11, 27, 14, 30, 15, 31))
> + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
>
> - // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
> - unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
> - while ((1U << Shift) < NumElems) {
> - if (SVOp->getMaskElt(1U << Shift) == 1)
> - break;
> - Shift += 1;
> - // The maximal ratio is 8, i.e. from i8 to i64.
> - if (Shift > 3)
> - return SDValue();
> - }
> + // FIXME: Implement direct support for this type!
> + return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
> +}
>
> - // Check the shuffle mask.
> - unsigned Mask = (1U << Shift) - 1;
> - for (unsigned i = 0; i != NumElems; ++i) {
> - int EltIdx = SVOp->getMaskElt(i);
> - if ((i & Mask) != 0 && EltIdx != -1)
> - return SDValue();
> - if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
> - return SDValue();
> - }
> +/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
> +static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> + const X86Subtarget *Subtarget,
> + SelectionDAG &DAG) {
> + SDLoc DL(Op);
> + assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
> + assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
> + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> + ArrayRef<int> Mask = SVOp->getMask();
> + assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
> + assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
>
> - unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
> - MVT NeVT = MVT::getIntegerVT(NBits);
> - MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
> + // FIXME: Implement direct support for this type!
> + return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG);
> +}
>
> - if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
> - return SDValue();
> +/// \brief Handle lowering of 64-lane 8-bit integer shuffles.
> +static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> + const X86Subtarget *Subtarget,
> + SelectionDAG &DAG) {
> + SDLoc DL(Op);
> + assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
> + assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
> + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> + ArrayRef<int> Mask = SVOp->getMask();
> + assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
> + assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
>
> - return DAG.getNode(ISD::BITCAST, DL, VT,
> - DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
> + // FIXME: Implement direct support for this type!
> + return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
> }
>
> -static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
> - SelectionDAG &DAG) {
> +/// \brief High-level routine to lower various 512-bit x86 vector shuffles.
> +///
> +/// This routine either breaks down the specific type of a 512-bit x86 vector
> +/// shuffle or splits it into two 256-bit shuffles and fuses the results back
> +/// together based on the available instructions.
> +static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> + MVT VT, const X86Subtarget *Subtarget,
> + SelectionDAG &DAG) {
> + SDLoc DL(Op);
> ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> - MVT VT = Op.getSimpleValueType();
> - SDLoc dl(Op);
> - SDValue V1 = Op.getOperand(0);
> - SDValue V2 = Op.getOperand(1);
> -
> - if (isZeroShuffle(SVOp))
> - return getZeroVector(VT, Subtarget, DAG, dl);
> + ArrayRef<int> Mask = SVOp->getMask();
> + assert(Subtarget->hasAVX512() &&
> + "Cannot lower 512-bit vectors w/ basic ISA!");
>
> - // Handle splat operations
> - if (SVOp->isSplat()) {
> - // Use vbroadcast whenever the splat comes from a foldable load
> - SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
> - if (Broadcast.getNode())
> - return Broadcast;
> - }
> + // Check for being able to broadcast a single element.
> + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1,
> + Mask, Subtarget, DAG))
> + return Broadcast;
>
> - // Check integer expanding shuffles.
> - SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
> - if (NewOp.getNode())
> - return NewOp;
> + // Dispatch to each element type for lowering. If we don't have support for

> + // specific element type shuffles at 512 bits, immediately split them and
> + // lower them. Each lowering routine of a given type is allowed to assume that
> + // the requisite ISA extensions for that element type are available.
> + switch (VT.SimpleTy) {
> + case MVT::v8f64:
> + return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
> + case MVT::v16f32:
> + return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
> + case MVT::v8i64:
> + return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
> + case MVT::v16i32:
> + return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
> + case MVT::v32i16:
> + if (Subtarget->hasBWI())
> + return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
> + break;
> + case MVT::v64i8:
> + if (Subtarget->hasBWI())
> + return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
> + break;
>
> - // If the shuffle can be profitably rewritten as a narrower shuffle, then
> - // do it!
> - if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 ||
> - VT == MVT::v32i8) {
> - SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
> - if (NewOp.getNode())
> - return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
> - } else if (VT.is128BitVector() && Subtarget->hasSSE2()) {
> - // FIXME: Figure out a cleaner way to do this.
> - if (ISD::isBuildVectorAllZeros(V2.getNode())) {
> - SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
> - if (NewOp.getNode()) {
> - MVT NewVT = NewOp.getSimpleValueType();
> - if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
> - NewVT, true, false))
> - return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget,
> - dl);
> - }
> - } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
> - SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
> - if (NewOp.getNode()) {
> - MVT NewVT = NewOp.getSimpleValueType();
> - if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
> - return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget,
> - dl);
> - }
> - }
> + default:
> + llvm_unreachable("Not a valid 512-bit x86 vector type!");
> }
> - return SDValue();
> +
> + // Otherwise fall back on splitting.
> + return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
> }
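On the splitAndLowerVectorShuffle fallback these 512-bit routines lean on: conceptually, once both wide inputs are split in half, every mask index lands in one of four half-width sources. A rough stand-alone sketch of just that index bookkeeping (not the in-tree splitting logic), assuming the usual convention that indices 0..N-1 select from V1 and N..2N-1 from V2:

    #include <cstdio>
    #include <vector>

    int main() {
      const int N = 8;  // e.g. a v8i64 shuffle split into two v4i64 halves
      std::vector<int> Mask = {0, 8, 2, 10, 5, 13, 7, 15};  // made-up example mask
      const char *Names[] = {"V1lo", "V1hi", "V2lo", "V2hi"};
      for (int i = 0; i != N; ++i) {
        int M = Mask[i];
        if (M < 0) {
          std::printf("out[%d] = undef\n", i);
          continue;
        }
        // M / (N/2) picks one of the four half vectors, M % (N/2) the lane in it.
        std::printf("out[%d] = %s[%d]\n", i, Names[M / (N / 2)], M % (N / 2));
      }
    }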
>
> -SDValue
> -X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
> +/// \brief Top-level lowering for x86 vector shuffles.
> +///
> +/// This handles decomposition, canonicalization, and lowering of all x86
> +/// vector shuffles. Most of the specific lowering strategies are encapsulated
> +/// above in helper routines. The canonicalization attempts to widen shuffles
> +/// to involve fewer lanes of wider elements, consolidate symmetric patterns
> +/// s.t. only one of the two inputs needs to be tested, etc.
> +static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
> + SelectionDAG &DAG) {
> ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> + ArrayRef<int> Mask = SVOp->getMask();
> SDValue V1 = Op.getOperand(0);
> SDValue V2 = Op.getOperand(1);
> MVT VT = Op.getSimpleValueType();
> + int NumElements = VT.getVectorNumElements();
> SDLoc dl(Op);
> - unsigned NumElems = VT.getVectorNumElements();
> - bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
> - bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
> - bool V1IsSplat = false;
> - bool V2IsSplat = false;
> - bool HasSSE2 = Subtarget->hasSSE2();
> - bool HasFp256 = Subtarget->hasFp256();
> - bool HasInt256 = Subtarget->hasInt256();
> - MachineFunction &MF = DAG.getMachineFunction();
> - bool OptForSize =
> - MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize);
> -
> - // Check if we should use the experimental vector shuffle lowering. If so,
> - // delegate completely to that code path.
> - if (ExperimentalVectorShuffleLowering)
> - return lowerVectorShuffle(Op, Subtarget, DAG);
>
> assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
>
> + bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
> + bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
> if (V1IsUndef && V2IsUndef)
> return DAG.getUNDEF(VT);
>
> @@ -12665,322 +9941,111 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(S
> if (V1IsUndef)
> return DAG.getCommutedVectorShuffle(*SVOp);
>
> - // Vector shuffle lowering takes 3 steps:
> - //
> - // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
> - // narrowing and commutation of operands should be handled.
> - // 2) Matching of shuffles with known shuffle masks to x86 target specific
> - // shuffle nodes.
> - // 3) Rewriting of unmatched masks into new generic shuffle operations,
> - // so the shuffle can be broken into other shuffles and the legalizer can
> - // try the lowering again.
> - //
> - // The general idea is that no vector_shuffle operation should be left to
> - // be matched during isel, all of them must be converted to a target specific
> - // node here.
> -
> - // Normalize the input vectors. Here splats, zeroed vectors, profitable
> - // narrowing and commutation of operands should be handled. The actual code
> - // doesn't include all of those, work in progress...
> - SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
> - if (NewOp.getNode())
> - return NewOp;
> -
> - SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
> -
> - // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
> - // unpckh_undef). Only use pshufd if speed is more important than size.
> - if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
> - return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
> - if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
> - return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
> -
> - if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
> - V2IsUndef && MayFoldVectorLoad(V1))
> - return getMOVDDup(Op, dl, V1, DAG);
> -
> - if (isMOVHLPS_v_undef_Mask(M, VT))
> - return getMOVHighToLow(Op, dl, DAG);
> -
> - // Use to match splats
> - if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
> - (VT == MVT::v2f64 || VT == MVT::v2i64))
> - return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
> -
> - if (isPSHUFDMask(M, VT)) {
> - // The actual implementation will match the mask in the if above and then
> - // during isel it can match several different instructions, not only pshufd
> - // as its name says, sad but true, emulate the behavior for now...
> - if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
> - return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
> -
> - unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
> -
> - if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
> - return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
> -
> - if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
> - return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, TargetMask,
> - DAG);
> -
> - return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
> - TargetMask, DAG);
> - }
> -
> - if (isPALIGNRMask(M, VT, Subtarget))
> - return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
> - getShufflePALIGNRImmediate(SVOp),
> - DAG);
> -
> - if (isVALIGNMask(M, VT, Subtarget))
> - return getTargetShuffleNode(X86ISD::VALIGN, dl, VT, V1, V2,
> - getShuffleVALIGNImmediate(SVOp),
> - DAG);
> -
> - // Check if this can be converted into a logical shift.
> - bool isLeft = false;
> - unsigned ShAmt = 0;
> - SDValue ShVal;
> - bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
> - if (isShift && ShVal.hasOneUse()) {
> - // If the shifted value has multiple uses, it may be cheaper to use
> - // v_set0 + movlhps or movhlps, etc.
> - MVT EltVT = VT.getVectorElementType();
> - ShAmt *= EltVT.getSizeInBits();
> - return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
> - }
> + // Check for non-undef masks pointing at an undef vector and make the masks
> + // undef as well. This makes it easier to match the shuffle based solely on
> + // the mask.
> + if (V2IsUndef)
> + for (int M : Mask)
> + if (M >= NumElements) {
> + SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
> + for (int &M : NewMask)
> + if (M >= NumElements)
> + M = -1;
> + return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
> + }
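A minimal stand-alone sketch of the canonicalization just above, with made-up values: when V2 is undef, a mask index pointing into it carries no information, so it is rewritten to the -1 ("undef lane") sentinel and later matching only has to look at the mask.

    #include <cstdio>
    #include <vector>

    int main() {
      const int NumElements = 4;
      std::vector<int> Mask = {0, 5, 2, 7};  // 5 and 7 point into an undef V2
      for (int &M : Mask)
        if (M >= NumElements)
          M = -1;
      for (int M : Mask)
        std::printf("%d ", M);               // prints: 0 -1 2 -1
      std::printf("\n");
    }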
>
> - if (isMOVLMask(M, VT)) {
> - if (ISD::isBuildVectorAllZeros(V1.getNode()))
> - return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
> - if (!isMOVLPMask(M, VT)) {
> - if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
> - return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
> + // We actually see shuffles that are entirely re-arrangements of a set of
> + // zero inputs. This mostly happens while decomposing complex shuffles into
> + // simple ones. Directly lower these as a buildvector of zeros.
> + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
> + if (Zeroable.all())
> + return getZeroVector(VT, Subtarget, DAG, dl);
>
> - if (VT == MVT::v4i32 || VT == MVT::v4f32)
> - return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
> + // Try to collapse shuffles into using a vector type with fewer elements but
> + // wider element types. We cap this to not form integers or floating point
> + // elements wider than 64 bits, but it might be interesting to form i128
> + // integers to handle flipping the low and high halves of AVX 256-bit vectors.
> + SmallVector<int, 16> WidenedMask;
> + if (VT.getScalarSizeInBits() < 64 &&
> + canWidenShuffleElements(Mask, WidenedMask)) {
> + MVT NewEltVT = VT.isFloatingPoint()
> + ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
> + : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
> + MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
> + // Make sure that the new vector type is legal. For example, v2f64 isn't
> + // legal on SSE1.
> + if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
> + V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
> + V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
> + return DAG.getNode(ISD::BITCAST, dl, VT,
> + DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
> }
> }
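Here is the widening step above as a stand-alone sketch. widenMask below is illustrative only and is stricter than the in-tree canWidenShuffleElements, which also accepts pairs that mix an undef lane with a defined index; the core idea is that a mask over 2*K narrow lanes becomes a mask over K double-width lanes when every aligned output pair reads an aligned even/odd input pair.

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Returns true and fills Wide if Mask (over 2*K narrow lanes) can be
    // expressed over K lanes of twice the width. A fully undef pair becomes an
    // undef wide lane; for simplicity this sketch rejects pairs that mix undef
    // with a defined index.
    static bool widenMask(const std::vector<int> &Mask, std::vector<int> &Wide) {
      Wide.clear();
      for (std::size_t i = 0; i + 1 < Mask.size(); i += 2) {
        int Lo = Mask[i], Hi = Mask[i + 1];
        if (Lo == -1 && Hi == -1) {
          Wide.push_back(-1);
          continue;
        }
        if (Lo != -1 && Lo % 2 == 0 && Hi == Lo + 1) {
          Wide.push_back(Lo / 2);
          continue;
        }
        return false;  // the pair straddles narrow elements; cannot widen
      }
      return true;
    }

    int main() {
      std::vector<int> Mask = {2, 3, 6, 7, -1, -1, 0, 1};  // e.g. a v8i32 mask
      std::vector<int> Wide;
      if (widenMask(Mask, Wide))
        for (int M : Wide)
          std::printf("%d ", M);  // prints: 1 3 -1 0 (a v4i64 mask)
      std::printf("\n");
    }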
>
> - // FIXME: fold these into legal mask.
> - if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
> - return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
> -
> - if (isMOVHLPSMask(M, VT))
> - return getMOVHighToLow(Op, dl, DAG);
> -
> - if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
> - return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
> -
> - if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
> - return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
> -
> - if (isMOVLPMask(M, VT))
> - return getMOVLP(Op, dl, DAG, HasSSE2);
> -
> - if (ShouldXformToMOVHLPS(M, VT) ||
> - ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
> - return DAG.getCommutedVectorShuffle(*SVOp);
> -
> - if (isShift) {
> - // No better options. Use a vshldq / vsrldq.
> - MVT EltVT = VT.getVectorElementType();
> - ShAmt *= EltVT.getSizeInBits();
> - return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
> - }
> -
> - bool Commuted = false;
> - // FIXME: This should also accept a bitcast of a splat? Be careful, not
> - // 1,1,1,1 -> v8i16 though.
> - BitVector UndefElements;
> - if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode()))
> - if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
> - V1IsSplat = true;
> - if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode()))
> - if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
> - V2IsSplat = true;
> -
> - // Canonicalize the splat or undef, if present, to be on the RHS.
> - if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
> - CommuteVectorShuffleMask(M, NumElems);
> - std::swap(V1, V2);
> - std::swap(V1IsSplat, V2IsSplat);
> - Commuted = true;
> - }
> -
> - if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
> - // Shuffling low element of v1 into undef, just return v1.
> - if (V2IsUndef)
> - return V1;
> - // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
> - // the instruction selector will not match, so get a canonical MOVL with
> - // swapped operands to undo the commute.
> - return getMOVL(DAG, dl, VT, V2, V1);
> - }
> -
> - if (isUNPCKLMask(M, VT, HasInt256))
> - return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
> -
> - if (isUNPCKHMask(M, VT, HasInt256))
> - return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
> -
> - if (V2IsSplat) {
> - // Normalize mask so all entries that point to V2 points to its first
> - // element then try to match unpck{h|l} again. If match, return a
> - // new vector_shuffle with the corrected mask.p
> - SmallVector<int, 8> NewMask(M.begin(), M.end());
> - NormalizeMask(NewMask, NumElems);
> - if (isUNPCKLMask(NewMask, VT, HasInt256, true))
> - return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
> - if (isUNPCKHMask(NewMask, VT, HasInt256, true))
> - return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
> - }
> -
> - if (Commuted) {
> - // Commute is back and try unpck* again.
> - // FIXME: this seems wrong.
> - CommuteVectorShuffleMask(M, NumElems);
> - std::swap(V1, V2);
> - std::swap(V1IsSplat, V2IsSplat);
> -
> - if (isUNPCKLMask(M, VT, HasInt256))
> - return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
> -
> - if (isUNPCKHMask(M, VT, HasInt256))
> - return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
> - }
> + int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
> + for (int M : SVOp->getMask())
> + if (M < 0)
> + ++NumUndefElements;
> + else if (M < NumElements)
> + ++NumV1Elements;
> + else
> + ++NumV2Elements;
>
> - // Normalize the node to match x86 shuffle ops if needed
> - if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
> + // Commute the shuffle as needed such that more elements come from V1 than
> + // V2. This allows us to match the shuffle pattern strictly on how many
> + // elements come from V1 without handling the symmetric cases.
> + if (NumV2Elements > NumV1Elements)
> return DAG.getCommutedVectorShuffle(*SVOp);
>
> - // The checks below are all present in isShuffleMaskLegal, but they are
> - // inlined here right now to enable us to directly emit target specific
> - // nodes, and remove one by one until they don't return Op anymore.
> -
> - if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
> - SVOp->getSplatIndex() == 0 && V2IsUndef) {
> - if (VT == MVT::v2f64 || VT == MVT::v2i64)
> - return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
> - }
> -
> - if (isPSHUFHWMask(M, VT, HasInt256))
> - return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
> - getShufflePSHUFHWImmediate(SVOp),
> - DAG);
> -
> - if (isPSHUFLWMask(M, VT, HasInt256))
> - return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
> - getShufflePSHUFLWImmediate(SVOp),
> - DAG);
> -
> - unsigned MaskValue;
> - if (isBlendMask(M, VT, Subtarget->hasSSE41(), HasInt256, &MaskValue))
> - return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG);
> -
> - if (isSHUFPMask(M, VT))
> - return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
> - getShuffleSHUFImmediate(SVOp), DAG);
> -
> - if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
> - return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
> - if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
> - return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
> -
> - //===--------------------------------------------------------------------===//
> - // Generate target specific nodes for 128 or 256-bit shuffles only
> - // supported in the AVX instruction set.
> - //
> -
> - // Handle VMOVDDUPY permutations
> - if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
> - return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
> -
> - // Handle VPERMILPS/D* permutations
> - if (isVPERMILPMask(M, VT)) {
> - if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
> - return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
> - getShuffleSHUFImmediate(SVOp), DAG);
> - return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1,
> - getShuffleSHUFImmediate(SVOp), DAG);
> - }
> -
> - unsigned Idx;
> - if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
> - return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
> - Idx*(NumElems/2), DAG, dl);
> -
> - // Handle VPERM2F128/VPERM2I128 permutations
> - if (isVPERM2X128Mask(M, VT, HasFp256))
> - return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
> - V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
> -
> - if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
> - return getINSERTPS(SVOp, dl, DAG);
> -
> - unsigned Imm8;
> - if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
> - return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
> -
> - if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
> - VT.is512BitVector()) {
> - MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
> - MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
> - SmallVector<SDValue, 16> permclMask;
> - for (unsigned i = 0; i != NumElems; ++i) {
> - permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
> - }
> -
> - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask);
> - if (V2IsUndef)
> - // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
> - return DAG.getNode(X86ISD::VPERMV, dl, VT,
> - DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
> - return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1,
> - DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2);
> - }
> -
> - //===--------------------------------------------------------------------===//
> - // Since no target specific shuffle was selected for this generic one,
> - // lower it into other known shuffles. FIXME: this isn't true yet, but
> - // this is the plan.
> - //
> -
> - // Handle v8i16 specifically since SSE can do byte extraction and insertion.
> - if (VT == MVT::v8i16) {
> - SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
> - if (NewOp.getNode())
> - return NewOp;
> - }
> -
> - if (VT == MVT::v16i16 && HasInt256) {
> - SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG);
> - if (NewOp.getNode())
> - return NewOp;
> - }
> -
> - if (VT == MVT::v16i8) {
> - SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
> - if (NewOp.getNode())
> - return NewOp;
> + // When the number of V1 and V2 elements are the same, try to minimize the
> + // number of uses of V2 in the low half of the vector. When that is tied,
> + // ensure that the sum of indices for V1 is equal to or lower than the sum of
> + // indices for V2. When those are equal, try to ensure that the number of odd
> + // indices for V1 is lower than the number of odd indices for V2.
> + if (NumV1Elements == NumV2Elements) {
> + int LowV1Elements = 0, LowV2Elements = 0;
> + for (int M : SVOp->getMask().slice(0, NumElements / 2))
> + if (M >= NumElements)
> + ++LowV2Elements;
> + else if (M >= 0)
> + ++LowV1Elements;
> + if (LowV2Elements > LowV1Elements) {
> + return DAG.getCommutedVectorShuffle(*SVOp);
> + } else if (LowV2Elements == LowV1Elements) {
> + int SumV1Indices = 0, SumV2Indices = 0;
> + for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
> + if (SVOp->getMask()[i] >= NumElements)
> + SumV2Indices += i;
> + else if (SVOp->getMask()[i] >= 0)
> + SumV1Indices += i;
> + if (SumV2Indices < SumV1Indices) {
> + return DAG.getCommutedVectorShuffle(*SVOp);
> + } else if (SumV2Indices == SumV1Indices) {
> + int NumV1OddIndices = 0, NumV2OddIndices = 0;
> + for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
> + if (SVOp->getMask()[i] >= NumElements)
> + NumV2OddIndices += i % 2;
> + else if (SVOp->getMask()[i] >= 0)
> + NumV1OddIndices += i % 2;
> + if (NumV2OddIndices < NumV1OddIndices)
> + return DAG.getCommutedVectorShuffle(*SVOp);
> + }
> + }
> }
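For what it's worth, here is one reading of the commutation heuristic above folded into a single stand-alone predicate; shouldCommute is not an in-tree helper, just a sketch. Swap the inputs when more elements come from V2, then break ties by V2 uses in the low half, then by the sum of the result positions each input feeds, and finally by how many odd result positions each input feeds.

    #include <cstdio>
    #include <vector>

    // Mask indices 0..NumElements-1 select from V1, the rest from V2, -1 is undef.
    static bool shouldCommute(const std::vector<int> &Mask, int NumElements) {
      int V1 = 0, V2 = 0, LoV1 = 0, LoV2 = 0;
      int SumV1 = 0, SumV2 = 0, OddV1 = 0, OddV2 = 0;
      for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
        int M = Mask[i];
        if (M < 0)
          continue;
        bool FromV2 = M >= NumElements;
        (FromV2 ? V2 : V1) += 1;
        if (i < NumElements / 2)
          (FromV2 ? LoV2 : LoV1) += 1;
        (FromV2 ? SumV2 : SumV1) += i;
        (FromV2 ? OddV2 : OddV1) += i % 2;
      }
      if (V2 != V1)       return V2 > V1;        // prefer more elements from V1
      if (LoV2 != LoV1)   return LoV2 > LoV1;    // prefer fewer V2 uses in the low half
      if (SumV2 != SumV1) return SumV2 < SumV1;  // prefer V1 at the lower positions
      return OddV2 < OddV1;                      // prefer V1 at the fewer odd positions
    }

    int main() {
      std::vector<int> Mask = {4, 5, 0, 1};  // made-up v4 mask: low half comes from V2
      std::printf("commute? %d\n", (int)shouldCommute(Mask, 4));  // prints: commute? 1
    }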
>
> - if (VT == MVT::v32i8) {
> - SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
> - if (NewOp.getNode())
> - return NewOp;
> - }
> + // For each vector width, delegate to a specialized lowering routine.
> + if (VT.getSizeInBits() == 128)
> + return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
>
> - // Handle all 128-bit wide vectors with 4 elements, and match them with
> - // several different shuffle types.
> - if (NumElems == 4 && VT.is128BitVector())
> - return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
> + if (VT.getSizeInBits() == 256)
> + return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
>
> - // Handle general 256-bit shuffles
> - if (VT.is256BitVector())
> - return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
> + // Force AVX-512 vectors to be scalarized for now.
> + // FIXME: Implement AVX-512 support!
> + if (VT.getSizeInBits() == 512)
> + return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
>
> - return SDValue();
> + llvm_unreachable("Unimplemented!");
> }
>
> // This function assumes its argument is a BUILD_VECTOR of constants or
> @@ -19904,7 +16969,7 @@ SDValue X86TargetLowering::LowerOperatio
> case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG);
> case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
> case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
> - case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
> + case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
> case ISD::VSELECT: return LowerVSELECT(Op, DAG);
> case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
> case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
> @@ -25921,6 +22986,23 @@ static SDValue PerformISDSETCCCombine(SD
> return SDValue();
> }
>
> +static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
> + SelectionDAG &DAG) {
> + SDLoc dl(Load);
> + MVT VT = Load->getSimpleValueType(0);
> + MVT EVT = VT.getVectorElementType();
> + SDValue Addr = Load->getOperand(1);
> + SDValue NewAddr = DAG.getNode(
> + ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
> + DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
> +
> + SDValue NewLoad =
> + DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
> + DAG.getMachineFunction().getMachineMemOperand(
> + Load->getMemOperand(), 0, EVT.getStoreSize()));
> + return NewLoad;
> +}
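The only subtle part of the new NarrowVectorLoadToElement helper is the address math: the narrowed load reads at the original address plus Index times the element's store size, and the machine memory operand is narrowed to the element's store size. A trivial sketch of the offset computation with made-up numbers:

    #include <cstdint>
    #include <cstdio>

    int main() {
      std::uint64_t Addr = 0x1000;  // hypothetical base address of the original vector load
      unsigned Index = 3;           // which element to load on its own
      unsigned EltStoreSize = 4;    // e.g. 4 bytes for an i32/f32 element
      std::uint64_t NewAddr = Addr + std::uint64_t(Index) * EltStoreSize;
      std::printf("narrowed load reads from 0x%llx\n",
                  (unsigned long long)NewAddr);  // prints: 0x100c
    }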
> +
> static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
> const X86Subtarget *Subtarget) {
> SDLoc dl(N);
>
> Removed: llvm/trunk/test/CodeGen/X86/avx2-pmovx-256-old-shuffle.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-pmovx-256-old-shuffle.ll?rev=229963&view=auto
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/avx2-pmovx-256-old-shuffle.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/avx2-pmovx-256-old-shuffle.ll (removed)
> @@ -1,27 +0,0 @@
> -; RUN: llc < %s -x86-experimental-vector-shuffle-lowering=false -mattr=+avx2 | FileCheck %s
> -
> -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
> -target triple = "x86_64-apple-darwin"
> -
> -; PR21876
> -; The old shuffle lowering sometimes generates VZEXT nodes with both input
> -; and output same-sized types, here 256-bits. For instance, a v8i8 to v8i32
> -; zero-extend would become a (v8i32 (VZEXT v32i8)) node, which can't happen
> -; otherwise. The companion commit r223996 added those patterns temporarily.
> -; This test, along with the VR256 for AVX2 PMOVXrr instructions, should be
> -; removed once the old vector shuffle lowering goes away.
> -
> -define void @test_avx2_pmovx_256(<8 x i8>* %tmp64, <8 x float>* %tmp75) {
> -; CHECK-LABEL: test_avx2_pmovx_256
> -; We really don't care about the generated code.
> -; CHECK: vpmovzxbd
> -; CHECK: vcvtdq2ps
> -; CHECK: vmovups
> -; CHECK: vzeroupper
> -; CHECK: retq
> -
> - %wide.load458 = load <8 x i8>* %tmp64, align 1
> - %tmp68 = uitofp <8 x i8> %wide.load458 to <8 x float>
> - store <8 x float> %tmp68, <8 x float>* %tmp75, align 4
> - ret void
> -}
>
>