[llvm] r215702 - [x86] Add the initial skeleton of type-based dispatch for AVX vectors in
Adam Nemet
anemet at apple.com
Fri Aug 15 09:57:13 PDT 2014
On Aug 15, 2014, at 4:01 AM, Chandler Carruth <chandlerc at gmail.com> wrote:
> Author: chandlerc
> Date: Fri Aug 15 06:01:40 2014
> New Revision: 215702
>
> URL: http://llvm.org/viewvc/llvm-project?rev=215702&view=rev
> Log:
> [x86] Add the initial skeleton of type-based dispatch for AVX vectors in
> the new shuffle lowering and an implementation for v4 shuffles.
>
> This allows us to handle non-half-crossing shuffles directly for v4
> shuffles, both integer and floating point. This currently misses places
> where we could perform the blend via UNPCK instructions, but otherwise
> generates equally good or better code than the existing vector shuffle
> lowering for the included test cases. There are a few cases that are
> entertainingly better. ;]
>
> Modified:
> llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
>
> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=215702&r1=215701&r2=215702&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Aug 15 06:01:40 2014
> @@ -8334,6 +8334,17 @@ static SDValue lower128BitVectorShuffle(
> }
> }
>
> +static bool isHalfCrossingShuffleMask(ArrayRef<int> Mask) {
> + int Size = Mask.size();
> + for (int M : Mask.slice(0, Size / 2))
> + if (M >= 0 && (M % Size) >= Size / 2)
> + return true;
> + for (int M : Mask.slice(Size / 2, Size / 2))
> + if (M >= 0 && (M % Size) < Size / 2)
> + return true;
> + return false;
> +}
That’s a cool little function, but it won’t be usable in an AVX512 context. There too we have 128-bit lane-crossing and non-crossing instructions; unpack, for example, is just an extension of the AVX instruction with the same intra-lane logic. We should probably gravitate toward formulating these predicates as 128-bit-lane crossing rather than half crossing.
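Something along these lines, say (an untested sketch just to illustrate the direction; the helper name and deriving the lane width from the MVT are my own invention, not anything in the patch):

static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
  // Sketch only: like isHalfCrossingShuffleMask, but tests whether any
  // element would move across a fixed 128-bit lane boundary instead of
  // across the vector's halves. For 256-bit vectors the two notions
  // coincide; for 512-bit vectors they don't.
  int LaneSize = 128 / VT.getScalarType().getSizeInBits();
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i)
    // Undef elements (-1) never cross; the modulo folds indices into the
    // second operand back onto the first so both inputs are covered.
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      return true;
  return false;
}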
Adam
> +
> /// \brief Generic routine to split a 256-bit vector shuffle into 128-bit
> /// shuffles.
> ///
> @@ -8399,6 +8410,103 @@ static SDValue splitAndLower256BitVector
> return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
> }
>
> +/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
> +///
> +/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
> +/// isn't available.
> +static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> + const X86Subtarget *Subtarget,
> + SelectionDAG &DAG) {
> + SDLoc DL(Op);
> + assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
> + assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
> + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> + ArrayRef<int> Mask = SVOp->getMask();
> + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
> +
> + // FIXME: If we have AVX2, we should delegate to generic code as crossing
> + // shuffles aren't a problem and FP and int have the same patterns.
> +
> + // FIXME: We can handle these more cleverly than splitting for v4f64.
> + if (isHalfCrossingShuffleMask(Mask))
> + return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
> +
> + if (isSingleInputShuffleMask(Mask)) {
> + // Non-half-crossing single input shuffles can be lowered with an
> + // interleaved permutation.
> + unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
> + ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
> + return DAG.getNode(X86ISD::VPERMILP, DL, MVT::v4f64, V1,
> + DAG.getConstant(VPERMILPMask, MVT::i8));
> + }
> +
> + // Check if the blend happens to exactly fit that of SHUFPD.
> + if (Mask[0] < 4 && (Mask[1] == -1 || Mask[1] >= 4) &&
> + Mask[2] < 4 && (Mask[3] == -1 || Mask[3] >= 4)) {
> + unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) |
> + ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3);
> + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2,
> + DAG.getConstant(SHUFPDMask, MVT::i8));
> + }
> + if ((Mask[0] == -1 || Mask[0] >= 4) && Mask[1] < 4 &&
> + (Mask[2] == -1 || Mask[2] >= 4) && Mask[3] < 4) {
> + unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) |
> + ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3);
> + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1,
> + DAG.getConstant(SHUFPDMask, MVT::i8));
> + }
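(As a sanity check of the immediate computation above, tracing it through one of the new tests: for the mask <0, 4, 2, 6> the first branch matches, none of the four comparisons fire, so SHUFPDMask is 0 and we emit vshufpd with a zero immediate, i.e. ymm0[0],ymm1[0],ymm0[2],ymm1[2], which is exactly what the shuffle_v4f64_0426 test below checks for.)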
> +
> + // Shuffle the input elements into the desired positions in V1 and V2 and
> + // blend them together.
> + int V1Mask[] = {-1, -1, -1, -1};
> + int V2Mask[] = {-1, -1, -1, -1};
> + for (int i = 0; i < 4; ++i)
> + if (Mask[i] >= 0 && Mask[i] < 4)
> + V1Mask[i] = Mask[i];
> + else if (Mask[i] >= 4)
> + V2Mask[i] = Mask[i] - 4;
> +
> + V1 = DAG.getVectorShuffle(MVT::v4f64, DL, V1, DAG.getUNDEF(MVT::v4f64), V1Mask);
> + V2 = DAG.getVectorShuffle(MVT::v4f64, DL, V2, DAG.getUNDEF(MVT::v4f64), V2Mask);
> +
> + unsigned BlendMask = 0;
> + for (int i = 0; i < 4; ++i)
> + if (Mask[i] >= 4)
> + BlendMask |= 1 << i;
> +
> + return DAG.getNode(X86ISD::BLENDI, DL, MVT::v4f64, V1, V2,
> + DAG.getConstant(BlendMask, MVT::i8));
> +}
> +
> +/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
> +///
> +/// Largely delegates to common code when we have AVX2 and to the floating-point
> +/// code when we only have AVX.
> +static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> + const X86Subtarget *Subtarget,
> + SelectionDAG &DAG) {
> + SDLoc DL(Op);
> + assert(Op.getSimpleValueType() == MVT::v4i64 && "Bad shuffle type!");
> + assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
> + assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
> + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
> + ArrayRef<int> Mask = SVOp->getMask();
> + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
> +
> + // FIXME: If we have AVX2, we should delegate to generic code as crossing
> + // shuffles aren't a problem and FP and int have the same patterns.
> +
> + if (isHalfCrossingShuffleMask(Mask))
> + return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
> +
> + // AVX1 doesn't provide any facilities for v4i64 shuffles, so bitcast and
> + // delegate to the floating-point code.
> + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f64, V1);
> + V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f64, V2);
> + return DAG.getNode(ISD::BITCAST, DL, MVT::v4i64,
> + lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG));
> +}
> +
> /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
> ///
> /// This routine either breaks down the specific type of a 256-bit x86 vector
> @@ -8407,16 +8515,24 @@ static SDValue splitAndLower256BitVector
> static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
> MVT VT, const X86Subtarget *Subtarget,
> SelectionDAG &DAG) {
> - // FIXME: We should detect symmetric patterns and re-use the 128-bit shuffle
> - // lowering logic with wider types in that case.
> + switch (VT.SimpleTy) {
> + case MVT::v4f64:
> + return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
> + case MVT::v4i64:
> + return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
> + case MVT::v8i32:
> + case MVT::v8f32:
> + case MVT::v16i16:
> + case MVT::v32i8:
> + // Fall back to the basic pattern of extracting the high half and forming
> + // a 4-way blend.
> + // FIXME: Add targeted lowering for each type that can document rationale
> + // for delegating to this when necessary.
> + return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
>
> - // FIXME: We should detect when we can use AVX2 cross-half shuffles to either
> - // implement the shuffle completely, more effectively build symmetry, or
> - // minimize half-blends.
> -
> - // Fall back to the basic pattern of extracting the high half and forming
> - // a 4-way blend.
> - return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
> + default:
> + llvm_unreachable("Not a valid 256-bit x86 vector type!");
> + }
> }
>
> /// \brief Tiny helper function to test whether a shuffle mask could be
>
> Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll?rev=215702&r1=215701&r2=215702&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll Fri Aug 15 06:01:40 2014
> @@ -169,6 +169,89 @@ define <4 x double> @shuffle_v4f64_3210(
> %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
> ret <4 x double> %shuffle
> }
> +define <4 x double> @shuffle_v4f64_0023(<4 x double> %a, <4 x double> %b) {
> +; AVX1-LABEL: @shuffle_v4f64_0023
> +; AVX1: # BB#0:
> +; AVX1-NEXT: vpermilpd {{.*}} # ymm0 = ymm0[0,0,2,3]
> +; AVX1-NEXT: retq
> + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
> + ret <4 x double> %shuffle
> +}
> +define <4 x double> @shuffle_v4f64_0022(<4 x double> %a, <4 x double> %b) {
> +; AVX1-LABEL: @shuffle_v4f64_0022
> +; AVX1: # BB#0:
> +; AVX1-NEXT: vpermilpd {{.*}} # ymm0 = ymm0[0,0,2,2]
> +; AVX1-NEXT: retq
> + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
> + ret <4 x double> %shuffle
> +}
> +define <4 x double> @shuffle_v4f64_1032(<4 x double> %a, <4 x double> %b) {
> +; AVX1-LABEL: @shuffle_v4f64_1032
> +; AVX1: # BB#0:
> +; AVX1-NEXT: vpermilpd {{.*}} # ymm0 = ymm0[1,0,3,2]
> +; AVX1-NEXT: retq
> + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
> + ret <4 x double> %shuffle
> +}
> +define <4 x double> @shuffle_v4f64_1133(<4 x double> %a, <4 x double> %b) {
> +; AVX1-LABEL: @shuffle_v4f64_1133
> +; AVX1: # BB#0:
> +; AVX1-NEXT: vpermilpd {{.*}} # ymm0 = ymm0[1,1,3,3]
> +; AVX1-NEXT: retq
> + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
> + ret <4 x double> %shuffle
> +}
> +define <4 x double> @shuffle_v4f64_1023(<4 x double> %a, <4 x double> %b) {
> +; AVX1-LABEL: @shuffle_v4f64_1023
> +; AVX1: # BB#0:
> +; AVX1-NEXT: vpermilpd {{.*}} # ymm0 = ymm0[1,0,2,3]
> +; AVX1-NEXT: retq
> + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
> + ret <4 x double> %shuffle
> +}
> +define <4 x double> @shuffle_v4f64_1022(<4 x double> %a, <4 x double> %b) {
> +; AVX1-LABEL: @shuffle_v4f64_1022
> +; AVX1: # BB#0:
> +; AVX1-NEXT: vpermilpd {{.*}} # ymm0 = ymm0[1,0,2,2]
> +; AVX1-NEXT: retq
> + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 2, i32 2>
> + ret <4 x double> %shuffle
> +}
> +define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) {
> +; AVX1-LABEL: @shuffle_v4f64_0423
> +; AVX1: # BB#0:
> +; AVX1-NEXT: vpermilpd {{.*}} # ymm1 = ymm1[{{[0-9]}},0,{{[0-9],[0-9]}}]
> +; AVX1-NEXT: vblendpd {{.*}} # ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
> +; AVX1-NEXT: retq
> + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
> + ret <4 x double> %shuffle
> +}
> +define <4 x double> @shuffle_v4f64_0462(<4 x double> %a, <4 x double> %b) {
> +; AVX1-LABEL: @shuffle_v4f64_0462
> +; AVX1: # BB#0:
> +; AVX1-NEXT: vpermilpd {{.*}} # ymm1 = ymm1[{{[0-9]}},0,2,{{[0-9]}}]
> +; AVX1-NEXT: vpermilpd {{.*}} # ymm0 = ymm0[0,{{[0-9],[0-9]}},2]
> +; AVX1-NEXT: vblendpd {{.*}} # ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
> +; AVX1-NEXT: retq
> + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 6, i32 2>
> + ret <4 x double> %shuffle
> +}
> +define <4 x double> @shuffle_v4f64_0426(<4 x double> %a, <4 x double> %b) {
> +; AVX1-LABEL: @shuffle_v4f64_0426
> +; AVX1: # BB#0:
> +; AVX1-NEXT: vshufpd {{.*}} # ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
> +; AVX1-NEXT: retq
> + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
> + ret <4 x double> %shuffle
> +}
> +define <4 x double> @shuffle_v4f64_5163(<4 x double> %a, <4 x double> %b) {
> +; AVX1-LABEL: @shuffle_v4f64_5163
> +; AVX1: # BB#0:
> +; AVX1-NEXT: vshufpd {{.*}} # ymm0 = ymm1[1],ymm0[1],ymm1[2],ymm0[3]
> +; AVX1-NEXT: retq
> + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
> + ret <4 x double> %shuffle
> +}
>
> define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
> ; AVX1-LABEL: @shuffle_v4i64_0124
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits